David Fuentes (dmf4ns)
DS 5001
University Of Virginia
Summer 2021
Purpose: This section contains the code used to pull the raw transcripts from Motley Fool's conference-call transcript service.
Note that this section will scrape Motley Fool's site if you run it. Depending on your Chrome settings and whether you have ChromeDriver, this may not run. Regardless, the code needs to be altered to reference ChromeDriver's location on your computer.
Data output from this section flows into Section 2 and will be included with submitted files.
# import packages
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import Request, urlopen
import pandas as pd
import time
import io
from selenium import webdriver
from ftfy import fix_encoding
Set up list of companies whose data the program will scrape:
Note: Motley Fool times out after scraping about 7 companies. Waiting some time before scraping again works well. Each company has about 2 years' worth of transcripts on the MF site.
companies = ['PH', 'SLB', 'GIS', 'RMD', 'XEL', 'MSI', 'SBAC'] # list of companies to scrape
I used S&P companies for this analysis. This markdown cell contains the remaining companies along with those I scraped for my own purposes (so I could copy and paste into the companies list each time I ran).
Remaining Companies: 'AON','ROP', 'JCI', 'BIIB', 'NEM', 'FCX', 'KMB', 'PSA', 'IQV', 'MSCI', 'KLAC', 'TT', 'A', 'TROW', 'LHX', 'EBAY', 'EXC', 'DLR', 'TEL', 'CMG', 'DOW', 'ALGN', 'GD', 'AEP', 'INFO', 'MET', 'DXCM', 'SNPS', 'ORLY', 'CNC', 'ROST', 'EOG', 'SRE', 'EA', 'BAX', 'APH', 'APTV', 'ALXN', 'CARR', 'PPG', 'DD', 'AIG', 'ALL', 'SPG', 'CDNS', 'BK', 'TRV', 'PRU', 'STZ', 'PH', 'SLB', 'GIS', 'RMD', 'XEL', 'MSI', 'SBAC', 'MCHP', 'SYY', 'DFS', 'CTSH', 'WELL', 'PAYX', 'IFF', 'MAR', 'FTNT', 'CMI', 'AZO', 'OTIS', 'MNST', 'YUM', 'ROK', 'MTD', 'KMI', 'CTAS', 'TDG', 'HPQ', 'MPC', 'AFL', 'FRC', 'WBA', 'HLT', 'SWK', 'ADM', 'XLNX', 'PXD', 'KR', 'ZBH', 'AVB', 'SWKS', 'PSX', 'FAST', 'AME', 'CTVA', 'PEG', 'AWK', 'EFX', 'GLW', 'PCAR', 'WMB', 'VRSK', 'MCK', 'WEC', 'ANSS', 'ES'
Finished: 'AAPL', 'MSFT', 'AMZN', 'FB', 'GOOGL', 'GOOG', 'BRK.B', 'TSLA', 'NVDA', 'JPM', 'JNJ', 'V', 'UNH', 'PYPL', 'HD', 'PG', 'DIS', 'MA', 'BAC', 'ADBE', 'CMCSA', 'XOM', 'NFLX', 'VZ', 'INTC', 'CRM', 'CSCO', 'PFE', 'KO', 'ABT', 'ABBV', 'PEP', 'NKE', 'T', 'TMO', 'CVX', 'ACN', 'AVGO', 'MRK', 'WMT', 'LLY','WFC', 'COST', 'TXN', 'DHR', 'MCD', 'MDT', 'QCOM','EPZM','ORCL','HON','UPS','PM','LIN','BMY','NEE','UNP','C','AMGN','SBUX','LOW','INTU','MS','BA', 'RTX','BLK','GS','AMT','IBM','AMAT','TGT','CAT', 'MMM','GE','ISRG','AXP','NOW','DE','AMD','CVS', 'SCHW','CHTR','SPGI','ANTM','ZTS','PLD','LMT', 'BKNG','FIS','LRCX','SCHW','CHTR','SPGI','ANTM','ZTS', 'PLD','LMT','BKNG','FIS','LRCX','MU','MDLZ','MO', 'CCI','ADP','SYK','TMUS','GILD','TJX','PNC','COP','CI','USB','DUK','CME','GM','TFC','COF','EL', 'EQIX','CB','FDX','CSX','BDX','MMC','ATVI', 'ILMN','CL','ICE','EW', 'SHW','NSC','ITW','SO','FISV','APD','ADSK','ADI','ETN','BSX', 'D','REGN','MCO','EMR','HUM','PGR', 'HCA','F','NXPI','IDXX','GPN','TWTR', 'HCA','F','NXPI','IDXX','GPN','TWTR','WM', 'NOC','FCX','ECL','AON','BIIB','DG','VRTX', 'AON','ROP','JCI','BIIB','NEM','FCX','KMB', 'PSA','IQV','MSCI','KLAC','TT','A','TROW','LHX','EBAY','EXC','DLR','TEL','CMG','DOW','ALGN','GD','AEP','INFO','MET','DXCM','SNPS', 'ORLY','CNC','ROST','EOG','SRE','EA','BAX','ALL', 'SPG', 'CDNS', 'BK','TRV', 'PRU', 'STZ', 'APH','APTV','ALXN','CARR','PPG','DD','AIG', 'AON','ROP','JCI','BIIB','NEM','FCX','KMB'
Open each URL from Motley Fool for each ticker in the companies list; this opens the company's dashboard page. The transcript links within the main page for each company contains call-transcript, so I use BeautifulSoup to save down any link on the page containing that sub-string (with a list comprehension). I append all the transcript URLs for each company to the same list.
%time
options = webdriver.ChromeOptions()
options.add_argument('--headless') # Option so that selenium doesn't open a new Chrome window with each pull and just works in the background
t_0 = time.time() # will use to time the process later
# input headers in a dict to bypass issue loading transcript site
hd = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) \
Chrome/23.0.1271.64 Safari/537.11 \
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/39.0.2171.95 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
# URL base, from which we pull the set of links
links = [] # empty list to which we will append the links to the transcripts
for i in companies:
loop_time = time.time()
root = 'https://www.fool.com/quote/'+str(i) # the pattern for the main company dashboard
time.sleep(6) # I was trying to use this so MF wouldn't time out, but didn't quite work
# send request to site with headers to bypass Forbidden issue
req = Request(root, headers = hd)
# read site
html_page = urlopen(req).read()
# create HTML "soup" from which we will pull our information
soup = BeautifulSoup(html_page, "lxml")
# initiate web driver for Chrome
import os
driver = webdriver.Chrome(executable_path=r'/opt/homebrew/bin/chromedriver', options=options)
#driver = webdriver.Chrome(options=options)#, executable_path = '/Users/dfuent/Desktop/chromedriver.exe')
driver.get(root)
for link in soup.findAll('a'): # locate all links
links.append(str(link.get('href'))) # append the link to the link list
# we only want the data if the link contains 'call-transcript', which is the convention used by the CPD
t = [i for i in links if 'call-transcript' in i] # list comp to get transcript links
t = list(set(t)) # using set removes any duplicates (just in case)
links.append(t)
driver.quit() # close the Chrome driver
print('Loop {0} took {1: .2f} seconds.'.format(i, time.time()-loop_time))
CPU times: user 1 µs, sys: 0 ns, total: 1 µs Wall time: 1.91 µs Loop PH took 11.68 seconds. Loop SLB took 22.53 seconds. Loop GIS took 19.41 seconds. Loop RMD took 18.86 seconds. Loop XEL took 18.27 seconds. Loop MSI took 11.99 seconds. Loop SBAC took 12.00 seconds.
The first scraper created a list of transcript links for the companies in the companies list. We can now open each transcript and pull the raw transcript data.
fin_list = [] # empty list to which we will input the transcript strings
df = pd.DataFrame(columns = ['url', 'transcript_raw']) # create a blank DF to contain transcripts
print('There are {0} transcripts to scrape.'.format(len(t)))
n = 0
for i in t: # iterate through our list of links
loop_time = time.time()
n += 1
print('{0} of {1} total transcripts.'.format(n, len(t)))
url = 'https://www.fool.com' + str(i) # create the transcript URL
#print(url) # for testing
#Option so that selenium doesn't open a new Chrome window during each loop
options = webdriver.ChromeOptions()
options.add_argument('--headless')
#initiate web driver
driver = webdriver.Chrome(executable_path=r'/opt/homebrew/bin/chromedriver', options=options)
#use driver to open url
driver.get(url)
#wait three seconds to load page, just in casse
time.sleep(3)
#extract page HTML and parse with BeautifulSoup
html=driver.page_source
soup=BeautifulSoup(html,'html.parser')
# transcript name and date are tagged with the taggings below, so we pull those data
h = soup('<span class="article-content">')
h = str(h)[1:-1].replace('<span class="article-content">', '').replace('</span', '') # replace the taggings so the data are clean
#print(h) # print the transcript for testing
tr = str(soup('p')) # pull each block (basically paragraphs and transitions between speakers) of the transcript parsed above
spl_tr = tr.split('</p>') # split the transcript into each speaker's block of text
temp = pd.DataFrame([[url, spl_tr]], columns = ['url', 'transcript_raw']) # send to a temp DF
# I want to stack the transcript into a few rows by the paragraph rather than just one long string
s = temp.apply(lambda x: pd.Series(x['transcript_raw']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'transcript'
temp = temp.join(s)
# add the data from the temp DF to the full DF
df = df.append(temp).drop('transcript_raw', axis=1)
#df.to_csv('df_rawComp_vTEST.csv') # send transcripts to a CSV for the list of companies. this is commented out because I am done saving transcripts
driver.quit()
print('{0} took {1: .2f} seconds.'.format(url, time.time()-loop_time))
There are 70 transcripts to scrape. 1 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/10/16/schlumberger-limited-slb-q3-2020-earnings-call-tra/ took 7.99 seconds. 2 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/07/24/schlumberger-limited-slb-q2-2020-earnings-call-tra.aspx took 7.81 seconds. 3 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/01/22/schlumberger-limited-slb-q4-2020-earnings-call-tra/ took 12.41 seconds. 4 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/07/29/xcel-energy-inc-xel-q2-2021-earnings-call-transcri/ took 14.92 seconds. 5 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/08/01/xcel-energy-inc-xel-q2-2019-earnings-call-transcri.aspx took 14.86 seconds. 6 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/12/17/general-mills-inc-gis-q2-2021-earnings-call-transc/ took 14.71 seconds. 7 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/12/18/general-mills-inc-gis-q2-2020-earnings-call-transc.aspx took 12.30 seconds. 8 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/05/01/resmed-inc-rmd-q3-2020-earnings-call-transcript.aspx took 12.03 seconds. 9 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/05/03/resmed-inc-rmd-q1-2019-earnings-call-transcript.aspx took 8.39 seconds. 10 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/09/18/general-mills-inc-gis-q1-2020-earnings-call-transc.aspx took 7.76 seconds. 11 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/01/30/resmed-inc-rmd-q2-2020-earnings-call-transcript.aspx took 13.55 seconds. 12 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/08/05/motorola-solutions-inc-msi-q2-2021-earnings-call-t/ took 13.44 seconds. 13 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/04/18/schlumberger-nv-slb-q1-2019-earnings-call-transcri.aspx took 16.33 seconds. 14 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/03/20/general-mills-inc-gis-q3-2019-earnings-conference.aspx took 12.53 seconds. 15 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/03/24/general-mills-inc-gis-q3-2021-earnings-call-transc/ took 16.51 seconds. 16 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/05/03/parker-hannifin-corp-ph-q3-2019-earnings-call-tran.aspx took 13.98 seconds. 17 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/05/07/xcel-energy-inc-xel-q1-2020-earnings-call-transcri.aspx took 7.90 seconds. 18 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/01/29/resmed-rmd-q2-2021-earnings-call-transcript/ took 14.28 seconds. 19 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/02/23/sba-communications-corp-sbac-q4-2020-earnings-call/ took 13.06 seconds. 20 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/10/30/xcel-energy-inc-xel-q3-2020-earnings-call-transcri/ took 13.58 seconds. 21 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/05/06/sba-communications-corp-sbac-q1-2020-earnings-call.aspx took 13.51 seconds. 22 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/05/07/motorola-solutions-inc-msi-q1-2021-earnings-call-t/ took 13.68 seconds. 23 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/28/sba-communications-corp-sbac-q3-2019-earnings-call.aspx took 14.97 seconds. 24 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/07/29/sba-communications-corp-sbac-q2-2019-earnings-call.aspx took 8.07 seconds. 25 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/04/30/parker-hannifin-corp-ph-q3-2021-earnings-call-tran/ took 7.90 seconds. 26 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/08/03/sba-communications-corp-sbac-q2-2020-earnings-call.aspx took 12.24 seconds. 27 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/08/06/parker-hannifin-corp-ph-q4-2020-earnings-call-tran.aspx took 14.70 seconds. 28 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/07/30/xcel-energy-inc-xel-q2-2020-earnings-call-transcri.aspx took 13.19 seconds. 29 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/04/29/sba-communications-corp-sbac-q1-2019-earnings-conf.aspx took 8.10 seconds. 30 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/04/29/resmed-rmd-q3-2021-earnings-call-transcript/ took 13.17 seconds. 31 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/18/schlumberger-nv-slb-q3-2019-earnings-call-transcri.aspx took 13.57 seconds. 32 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/10/30/resmed-rmd-q1-2021-earnings-call-transcript/ took 17.54 seconds. 33 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/24/resmed-inc-rmd-q1-2020-earnings-call-transcript.aspx took 14.66 seconds. 34 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/10/30/motorola-solutions-inc-msi-q3-2020-earnings-call-t/ took 15.66 seconds. 35 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/08/01/parker-hannifin-corp-ph-q4-2019-earnings-call-tran.aspx took 14.32 seconds. 36 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/01/30/xcel-energy-inc-xel-q4-2019-earnings-call-transcri.aspx took 18.02 seconds. 37 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/08/02/sba-communications-corporation-sbac-q2-2021-earnin/ took 8.30 seconds. 38 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/08/02/motorola-solutions-inc-msi-q2-2019-earnings-call-t.aspx took 15.61 seconds. 39 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/02/05/motorola-solutions-inc-msi-q4-2020-earnings-call-t/ took 14.23 seconds. 40 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/04/30/parker-hannifin-corp-ph-q3-2020-earnings-call-tran.aspx took 12.57 seconds. 41 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/01/30/parker-hannifin-corp-ph-q2-2020-earnings-call-tran.aspx took 15.70 seconds. 42 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/06/30/general-mills-inc-gis-q4-2021-earnings-call-transc/ took 13.58 seconds. 43 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/06/26/general-mills-inc-gis-q4-2019-earnings-call-transc.aspx took 8.04 seconds. 44 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/04/17/schlumberger-nv-slb-q1-2020-earnings-call-transcri.aspx took 13.99 seconds. 45 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/24/xcel-energy-inc-xel-q3-2019-earnings-call-transcri.aspx took 9.17 seconds. 46 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/02/20/sba-communications-corp-sbac-q4-2019-earnings-call.aspx took 13.67 seconds. 47 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/02/04/parker-hannifin-corp-ph-q2-2021-earnings-call-tran/ took 16.09 seconds. 48 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/08/07/motorola-solutions-inc-msi-q2-2020-earnings-call-t.aspx took 13.96 seconds. 49 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/11/02/sba-communications-corp-sbac-q3-2020-earnings-call/ took 15.25 seconds. 50 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/03/18/general-mills-inc-gis-q3-2020-earnings-call-transc.aspx took 15.72 seconds. 51 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/08/06/resmed-rmd-q4-2021-earnings-call-transcript/ took 14.38 seconds. 52 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/04/26/sba-communications-corp-sbac-q1-2021-earnings-call/ took 7.82 seconds. 53 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/09/23/general-mills-inc-gis-q1-2021-earnings-call-transc/ took 15.11 seconds. 54 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/02/07/motorola-solutions-inc-msi-q4-2019-earnings-call-t.aspx took 13.97 seconds. 55 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/05/03/motorola-solutions-inc-msi-q1-2019-earnings-call-t.aspx took 15.62 seconds. 56 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/07/19/schlumberger-nv-slb-q2-2019-earnings-call-transcri.aspx took 8.34 seconds. 57 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/07/01/general-mills-inc-gis-q4-2020-earnings-call-transc.aspx took 13.64 seconds. 58 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/04/25/xcel-energy-inc-xel-q1-2019-earnings-call-transcri.aspx took 13.14 seconds. 59 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/07/23/schlumberger-limited-slb-q2-2021-earnings-call-tra/ took 16.79 seconds. 60 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/30/motorola-solutions-inc-msi-q3-2019-earnings-call-t.aspx took 21.44 seconds. 61 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/01/28/xcel-energy-inc-xel-q4-2020-earnings-call-transcri/ took 14.38 seconds. 62 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/11/06/parker-hannifin-corp-ph-q1-2021-earnings-call-tran/ took 13.84 seconds. 63 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/07/25/resmed-inc-rmd-q4-2019-earnings-call-transcript.aspx took 24.59 seconds. 64 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/08/05/parker-hannifin-corporation-ph-q2-2021-earnings-ca/ took 13.43 seconds. 65 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/04/29/xcel-energy-inc-xel-q1-2021-earnings-call-transcri/ took 13.95 seconds. 66 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/08/06/resmed-rmd-q4-2020-earnings-call-transcript.aspx took 8.64 seconds. 67 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2019/10/31/parker-hannifin-corp-ph-q1-2020-earnings-call-tran.aspx took 16.16 seconds. 68 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/05/08/motorola-solutions-inc-msi-q1-2020-earnings-call-t.aspx took 13.26 seconds. 69 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2021/04/23/schlumberger-limited-slb-q1-2021-earnings-call-tra/ took 14.62 seconds. 70 of 70 total transcripts. https://www.fool.com/earnings/call-transcripts/2020/01/17/schlumberger-nv-slb-q4-2019-earnings-call-transcri.aspx took 13.47 seconds.
This process was run about 30 times to pull sufficient transcript data. I am sure I can figure out a more practical way to get around the time-out errors from Motley Fool (or I could have automated a batch run every few hours for a new set of companies), but I simply ran this file whenever I remembered to over a week or so, and it wasn't a problem.
Purpose: This section contains the code used to transform the raw transcript data pulled from Motley Fool
import pandas as pd
import os
import glob
import warnings
warnings.filterwarnings('ignore') # I know the code works, but I get certain deprecation warnings and warning when my models timeout
YOU NEED TO CHANGE THE PATH BELOW. The rest of the input/output should work properly.
# change to your folder structure
os.chdir('/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data') # I needed to change my directory at one pointto pickup the correct files
dir = os.getcwd()
Get file names with raw transcript data.
f = glob.glob(dir + '/raw_data_files/df_rawComp_v*.csv') # all transcript files contain this name pattern
f # list of files
['/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v8.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v9.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v25.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v19.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v18.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v24.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v30.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v26.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v27.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v23.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v22.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v20.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v21.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v10.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v11.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v13.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v12.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v16.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v17.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v29.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v15.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v14.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v28.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v4.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v5.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v7.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v6.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v2.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v3.csv', '/Users/dfuent/Desktop/Desktop - David’s MacBook Pro/UVA/project_dfuentes/proj_data/raw_data_files/df_rawComp_v1.csv']
raw_df = pd.DataFrame()
for i in f:
tmp = pd.read_csv(i)
tmp['file'] = i
raw_df = raw_df.append(tmp)
raw_df.drop('Unnamed: 0', axis = 1, inplace = True)
sp_map = pd.read_csv('sp_mapping.csv') # contains S&P info I use to map to the data
sp_map = sp_map.rename(columns = {'Symbol':'ticker', 'GICS Sector':'sector', 'GICS Sub-Industry':'sub_sector'}).set_index('ticker')
View the raw data and save as F0:
raw_df
| url | transcript | file | |
|---|---|---|---|
| 0 | https://www.fool.com/earnings/call-transcripts... | [<p align="center"><small><em>Returns as of 7/... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 1 | https://www.fool.com/earnings/call-transcripts... | , <p align="center"><small><em>Returns as of 7... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 2 | https://www.fool.com/earnings/call-transcripts... | , <p>Founded in 1993 by brothers Tom and David... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 3 | https://www.fool.com/earnings/call-transcripts... | , <p class="caption">Image source: The Motley ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 4 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>American Tower Corp</strong> <sp... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| ... | ... | ... | ... |
| 15995 | https://www.fool.com/earnings/call-transcripts... | , <p class="launch-disclaimer" data-uw-rm-sr="... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15996 | https://www.fool.com/earnings/call-transcripts... | , <p class="launch-info">Stock Advisor launche... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15997 | https://www.fool.com/earnings/call-transcripts... | , <p class="copyright" id="footer-copyright-te... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15998 | https://www.fool.com/earnings/call-transcripts... | , <p>\n Market data powered b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15999 | https://www.fool.com/earnings/call-transcripts... | ] | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
551778 rows × 3 columns
raw_df.to_csv('../proj_data/output_data/F0_rawAll.csv') # F0 file
The transcripts contain many of the same lines for things like advertisements and other boilerplate items. I want to get ride of these.
raw_df.groupby('transcript').count().sort_values('file', ascending = False).head(25) # see top 25 most common lines
| url | file | |
|---|---|---|
| transcript | ||
| , <p><strong>Operator</strong> | 27380 | 27380 |
| , <p> | 5762 | 5762 |
| , <p>Thank you. | 3756 | 3756 |
| ] | 2117 | 2117 |
| , <p class="launch-disclaimer" data-uw-rm-sr="">Discounted offers are only available to new members. Stock Advisor will renew at the then current list price. Stock Advisor list price is $199 per year. | 2117 | 2117 |
| , <p>Founded in 1993 by brothers Tom and David Gardner, The Motley Fool helps millions of people attain financial freedom through our website, podcasts, books, newspaper column, radio show, and premium investing services. | 2117 | 2117 |
| , <p class="copyright" id="footer-copyright-text">© 1995 - 2021 The Motley Fool. All rights reserved.<br data-uw-rm-sr="" role="presentation"/> | 2117 | 2117 |
| , <p><em><span style="color: #767676; font-size: 8pt;">*Stock Advisor returns as of June 7, 2021</span></em> | 2115 | 2115 |
| , <p class="caption">Image source: The Motley Fool. | 2101 | 2101 |
| , <p>\n Market data powered by Xignite.\n | 2043 | 2043 |
| , <p><em>This article is a transcript of this conference call produced for The Motley Fool. While we strive for our Foolish Best, there may be errors, omissions, or inaccuracies in this transcript. As with all our articles, The Motley Fool does not assume any responsibility for your use of this content, and we strongly encourage you to do your own research, including listening to the call yourself and reading the company's SEC filings. Please see our </em><a data-uw-rm-brl="false" href="https://www.fool.com/legal/terms-and-conditions/fool-rules"><em>Terms and Conditions</em></a><em> for additional details, including our Obligatory Capitalized Disclaimers of Liability.</em> | 2035 | 2035 |
| , <p><a data-uw-rm-brl="false" href="https://www.fool.com/earnings-call-transcripts/">All earnings call transcripts</a> | 1668 | 1668 |
| , <p>Good morning. | 1048 | 1048 |
| , <p>[Operator Closing Remarks] | 996 | 996 |
| , <p><strong>Richard C. Adkerson</strong> -- <em data-uw-rm-mod="">Vice Chairman, President and Chief Executive Officer</em> | 798 | 798 |
| , <p align="center"><small><em>Returns as of 7/14/2021</em></small> | 635 | 635 |
| [<p align="center"><small><em>Returns as of 7/14/2021</em></small> | 635 | 635 |
| , <p>Thanks. | 627 | 627 |
| , <p><i><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool has no position in any of the stocks mentioned. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</i> | 622 | 622 |
| , <p>\n<a aria-label="AlphaStreet Logo - opens in new tab" data-uw-rm-brl="false" data-uw-rm-ext-link="" href="https://www.alphastreet.com" rel="nofollow noopener norefferer" target="_blank" title="AlphaStreet Logo - opens in new tab" uw-rm-external-link-id="https://www.alphastreet.com/$alphastreetlogo">\n<img alt="AlphaStreet Logo" class="original-image-src" data-uw-rm-ima-original="alphastreet logo" src="https://g.foolcdn.com/misc-assets/banner-2b-ff.jpg"/>\n</a>\n | 603 | 603 |
| , <p class="launch-info">Stock Advisor launched in February of 2002. Returns as of 07/14/2021. | 519 | 519 |
| , <p><strong>Greg Case</strong> -- <em>Chief Executive Officer</em> | 489 | 489 |
| , <p><strong>Kathleen L. Quirk</strong> -- <em>Executive Vice President and Chief Financial Officer</em> | 485 | 485 |
| , <p>Great. Thank you. | 476 | 476 |
| [<p align="center"><small><em>Returns as of 7/13/2021</em></small> | 459 | 459 |
check = raw_df.groupby('transcript').count().sort_values('file', ascending = False)
As an example, check some of the lines that contain Motley Fool, which I know should not be mentioned in any transcripts.
check[check.index.str.contains('Motley Fool')]
| url | file | |
|---|---|---|
| transcript | ||
| , <p>Founded in 1993 by brothers Tom and David Gardner, The Motley Fool helps millions of people attain financial freedom through our website, podcasts, books, newspaper column, radio show, and premium investing services. | 2117 | 2117 |
| , <p class="copyright" id="footer-copyright-text">© 1995 - 2021 The Motley Fool. All rights reserved.<br data-uw-rm-sr="" role="presentation"/> | 2117 | 2117 |
| , <p class="caption">Image source: The Motley Fool. | 2101 | 2101 |
| , <p><em>This article is a transcript of this conference call produced for The Motley Fool. While we strive for our Foolish Best, there may be errors, omissions, or inaccuracies in this transcript. As with all our articles, The Motley Fool does not assume any responsibility for your use of this content, and we strongly encourage you to do your own research, including listening to the call yourself and reading the company's SEC filings. Please see our </em><a data-uw-rm-brl="false" href="https://www.fool.com/legal/terms-and-conditions/fool-rules"><em>Terms and Conditions</em></a><em> for additional details, including our Obligatory Capitalized Disclaimers of Liability.</em> | 2035 | 2035 |
| , <p><i><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool has no position in any of the stocks mentioned. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</i> | 622 | 622 |
| ... | ... | ... |
| , <p><em><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool owns shares of Texas Instruments. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</em> | 1 | 1 |
| , <p><em><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool owns shares of and recommends Accenture. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</em> | 1 | 1 |
| , <p><em><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool owns shares of and recommends Adobe Systems. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</em> | 1 | 1 |
| , <p><em><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool owns shares of and recommends Align Technology. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</em> | 1 | 1 |
| , <p><em><a data-uw-rm-brl="false" href="http://boards.fool.com/profile/MFTranscribers/info.aspx">Motley Fool Transcribers</a> has no position in any of the stocks mentioned. The Motley Fool owns shares of and recommends American Tower. The Motley Fool has a <a data-uw-rm-brl="false" href="http://www.fool.com/Legal/fool-disclosure-policy.aspx">disclosure policy</a>.</em> | 1 | 1 |
720 rows × 2 columns
Create a list of these common transcript lines we want to remove.
top_remarks = list(raw_df.groupby('transcript').count().sort_values('file', ascending = False).head(30).index)
top_remarks = [i for i in top_remarks if ('OPERATOR' not in i.upper())
& ('--' not in i) & ('GOOD MORNING' not in i.upper())
& ('THANK YOU' not in i.upper())]
Remove lines that likely aren't transcript data:
raw_df = raw_df[~raw_df.transcript.isin(top_remarks)]
raw_df
| url | transcript | file | |
|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>American Tower Corp</strong> <sp... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 5 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>Operator</strong> | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 6 | https://www.fool.com/earnings/call-transcripts... | , <p>Ladies and gentlemen, thank you for stand... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 7 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>Igor Khislavsky</strong> -- <em>V... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 8 | https://www.fool.com/earnings/call-transcripts... | , <p>Thanks, Kevin. Good morning and thank you... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| ... | ... | ... | ... |
| 15988 | https://www.fool.com/earnings/call-transcripts... | , <p>They just revealed what they believe are ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15989 | https://www.fool.com/earnings/call-transcripts... | , <p><a class="ticker_pitch" data-uw-rm-brl="f... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15994 | https://www.fool.com/earnings/call-transcripts... | , <p><i data-uw-rm-sr="">Teresa Kersten, an em... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15996 | https://www.fool.com/earnings/call-transcripts... | , <p class="launch-info">Stock Advisor launche... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15998 | https://www.fool.com/earnings/call-transcripts... | , <p>\n Market data powered b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
521702 rows × 3 columns
#raw_df = raw_df[~raw_df.index.isin(top_remarks)]
df = raw_df[(~raw_df['transcript'].str.contains('Motley Fool'))
& ~raw_df['transcript'].str.contains('AlphaStreet')
& ~raw_df['transcript'].str.contains('Stock Advisor launched')
& ~raw_df['transcript'].str.contains('Returns as of 7/(.+)/2021')]
df
| url | transcript | file | |
|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>American Tower Corp</strong> <sp... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 5 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>Operator</strong> | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 6 | https://www.fool.com/earnings/call-transcripts... | , <p>Ladies and gentlemen, thank you for stand... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 7 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>Igor Khislavsky</strong> -- <em>V... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 8 | https://www.fool.com/earnings/call-transcripts... | , <p>Thanks, Kevin. Good morning and thank you... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | , <p><strong>Philip Winslow</strong> -- <em>We... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15984 | https://www.fool.com/earnings/call-transcripts... | , <p><a data-uw-rm-brl="false" href="https://w... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15988 | https://www.fool.com/earnings/call-transcripts... | , <p>They just revealed what they believe are ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15989 | https://www.fool.com/earnings/call-transcripts... | , <p><a class="ticker_pitch" data-uw-rm-brl="f... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
| 15998 | https://www.fool.com/earnings/call-transcripts... | , <p>\n Market data powered b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... |
513722 rows × 3 columns
df.groupby('transcript').count().sort_values('url', ascending = False).head(50)
| url | file | |
|---|---|---|
| transcript | ||
| , <p><strong>Operator</strong> | 27380 | 27380 |
| , <p>Thank you. | 3756 | 3756 |
| , <p>Good morning. | 1048 | 1048 |
| , <p>[Operator Closing Remarks] | 996 | 996 |
| , <p><strong>Richard C. Adkerson</strong> -- <em data-uw-rm-mod="">Vice Chairman, President and Chief Executive Officer</em> | 798 | 798 |
| , <p><strong>Greg Case</strong> -- <em>Chief Executive Officer</em> | 489 | 489 |
| , <p><strong>Kathleen L. Quirk</strong> -- <em>Executive Vice President and Chief Financial Officer</em> | 485 | 485 |
| , <p>Great. Thank you. | 476 | 476 |
| , <p><strong>David N. Farr</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 421 | 421 |
| , <p><strong>Doug Bettinger</strong> -- <em>Executive Vice President and Chief Financial Officer</em> | 420 | 420 |
| , <p>[Operator signoff] | 407 | 407 |
| , <p><strong>John Pitzer</strong> -- <em>Credit Suisse -- Analyst</em> | 391 | 391 |
| , <p><strong>Betsy Graseck</strong> -- <em>Morgan Stanley -- Analyst</em> | 381 | 381 |
| , <p><strong>Neil Hunn</strong> -- <em>President and Chief Executive Officer</em> | 366 | 366 |
| , <p><strong>Mike McMullen</strong> -- <em>President and Chief Executive Officer</em> | 359 | 359 |
| , <p><strong>Gail Boudreaux</strong> -- <em>President and Chief Executive Officer</em> | 346 | 346 |
| , <p><strong>Thomas A. Fanning</strong> -- <em data-uw-rm-mod="">Chairman, President and Chief Executive Officer</em> | 342 | 342 |
| , <p><strong>John G. Morikis</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 341 | 341 |
| , <p><strong>Sasan Goodarzi</strong> -- <em>Chief Executive Officer</em> | 336 | 336 |
| , <p><strong>Nigel Coe</strong> -- <em>Wolfe Research -- Analyst</em> | 332 | 332 |
| , <p><strong>Scott Davis</strong> -- <em>Melius Research -- Analyst</em> | 328 | 328 |
| , <p><strong>Michael H. McGarry</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 327 | 327 |
| , <p><strong>Jeffrey A. Stoops</strong> -- <em>Director, President and Chief Executive Officer</em> | 322 | 322 |
| , <p><strong>Mark Kimbrough</strong> -- <em>Vice President of Investor Relations</em> | 320 | 320 |
| , <p>Sure. | 315 | 315 |
| , <p>Okay. Thank you. | 304 | 304 |
| , <p><strong>Mark Mason</strong> -- <em>Chief Financial Officer</em> | 303 | 303 |
| , <p>Great, thank you. | 301 | 301 |
| , <p>Thank you very much. | 299 | 299 |
| , <p><strong>Darius Adamczyk</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 296 | 296 |
| , <p><strong>Julian Mitchell</strong> -- <em>Barclays -- Analyst</em> | 292 | 292 |
| , <p><strong>Michael D. Hsu</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 291 | 291 |
| , <p><strong>Jeffrey W. Martin</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 289 | 289 |
| , <p><strong>Lisa Su</strong> -- <em>President and Chief Executive Officer</em> | 283 | 283 |
| , <p><strong>Glenn D. Fogel</strong> -- <em>Chief Executive Officer and President</em> | 282 | 282 |
| , <p>Okay. | 281 | 281 |
| , <p><strong>Thomas M. Rutledge</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 280 | 280 |
| , <p><strong>Tim Archer</strong> -- <em>President and Chief Executive Officer</em> | 278 | 278 |
| , <p><strong>Seifi Ghasemi</strong> -- <em data-uw-rm-mod="">Chairman, President and Chief Executive Officer</em> | 276 | 276 |
| , <p><strong>Vijay Kumar</strong> -- <em>Evercore ISI -- Analyst</em> | 270 | 270 |
| , <p><strong>Gary Norcross</strong> -- <em data-uw-rm-mod="">Chairman, President and Chief Executive Officer</em> | 268 | 268 |
| , <p><strong>Christopher L. Winfrey</strong> -- <em>Chief Financial Officer</em> | 264 | 264 |
| , <p><strong>George Oliver</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 262 | 262 |
| , <p><strong>Michel Vounatsos</strong> -- <em>Chief Executive Officer</em> | 259 | 259 |
| , <p>Okay, thank you. | 255 | 255 |
| , <p><strong>Michael A. Mussallem</strong> -- <em data-uw-rm-mod="">Chairman and Chief Executive Officer</em> | 253 | 253 |
| , <p><strong>Joe Ritchie</strong> -- <em>Goldman Sachs -- Analyst</em> | 247 | 247 |
| , <p><strong>Olivier Le Peuch</strong> -- <em>Chief Executive Officer</em> | 245 | 245 |
| , <p><strong>Timothy Arcuri</strong> -- <em>UBS -- Analyst</em> | 243 | 243 |
| , <p><strong>David Lewis</strong> -- <em>Morgan Stanley -- Analyst</em> | 243 | 243 |
We now have a decently clean set of transcripts without the boilerplate/common lines that we aren't interested in. I will continue to clean these data, but this was a good first pass.
import re
#function to cleanup HTML
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>') # get anything between < >
cleantext = re.sub(cleanr, '', raw_html)
return cleantext
Identify lines containing a speaker name.
pos_names = df.transcript[df.transcript.str.contains('--') & df.transcript.str.contains('em')].apply(lambda x: cleanhtml(x).strip())\
.apply(lambda x: re.sub(r'^[\[\,]', '', x).strip())
speaker_names = [i for i in list(pos_names) if len(i.split())<=15] # should mostly be lines containing a speaker
#speaker_names
Clean up HTML tags and produce clean transcript data:
df['transcript'] = df['transcript'].apply(lambda x: cleanhtml(x).strip())\
.apply(lambda x: re.sub(r'^[\[\,]', '', x).strip())
I now have some cleansed transcript data. I want to start adding some helpful columns to my table:
Add an identifier in the table labeling where the transcript data changes from one company to another.
import numpy as np
df['co_id'] = 0 # should really be named 'Call ID'. This changes with each new call
# identify the lines in the DF where the URL changes (if it changes, I know it's a new quarterly call)
# add one to prior row's value. Now there is an integer ID for each conference call
for i in range(0, len(df)):
if i == 0:
df['co_id'].iloc[i] = 0
elif df['url'].iloc[i] != df['url'].iloc[i-1]:
df['co_id'].iloc[i] = df['co_id'].iloc[i-1] + 1
else:
df['co_id'].iloc[i] = df['co_id'].iloc[i-1]
#print('worked')
Add an iterator to identify each line in each of the transcripts. Mostly the lines represent changes in paragraphs. I later add a column identifying speaker handoff as well.
df['co_count'] = 0
for i in range(0, len(df)):
if i == 0:
df['co_count'].iloc[i] = 0
elif df['co_id'].iloc[i] == df['co_id'].iloc[i-1]:
df['co_count'].iloc[i] = df['co_count'].iloc[i-1] + 1
else:
df['co_count'].iloc[i] = 0
#print('worked')
df # see current DF
| url | transcript | file | co_id | co_count | |
|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 |
| ... | ... | ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 |
| 15989 | https://www.fool.com/earnings/call-transcripts... | See the 10 stocks | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 199 |
513722 rows × 5 columns
#df[df['co_count']==0].to_csv('companies.csv') # save DF to CSV
Add company name to the DF. I later map to my S&P table on ticker for consistency (not all transcripts used the same naming convention for a company).
df['co_name'] = ''
df['co_name'] = np.where(df['co_count'] == 0, df['transcript'].str.split('(').tolist(), np.nan) # pattern to get company name
co_map = df.co_name[df.co_count == 0].apply(lambda x: x[0].strip())
df[df['co_count'] == 0]
| url | transcript | file | co_id | co_count | co_name | |
|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | [American Tower Corp , NYSE:AMT)Q1 2019 Earni... |
| 237 | https://www.fool.com/earnings/call-transcripts... | United Technologies Corp (NYSE:RTX)Q4 2018 Ea... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 1 | 0 | [United Technologies Corp , NYSE:RTX)Q4 2018 ... |
| 531 | https://www.fool.com/earnings/call-transcripts... | IBM (NYSE:IBM)Q3 2020 Earnings CallOct 19, 202... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2 | 0 | [IBM , NYSE:IBM)Q3 2020 Earnings CallOct 19, 2... |
| 762 | https://www.fool.com/earnings/call-transcripts... | IBM (NYSE:IBM)Q1 2021 Earnings CallApr 19, 202... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 3 | 0 | [IBM , NYSE:IBM)Q1 2021 Earnings CallApr 19, 2... |
| 1005 | https://www.fool.com/earnings/call-transcripts... | IBM (NYSE:IBM)Q2 2020 Earnings CallJul 20, 202... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 4 | 0 | [IBM , NYSE:IBM)Q2 2020 Earnings CallJul 20, 2... |
| ... | ... | ... | ... | ... | ... | ... |
| 14882 | https://www.fool.com/earnings/call-transcripts... | Facebook Inc (NASDAQ:FB)Q1 2020 Earnings CallA... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2112 | 0 | [Facebook Inc , NASDAQ:FB)Q1 2020 Earnings Cal... |
| 15061 | https://www.fool.com/earnings/call-transcripts... | Tesla, Inc. (NASDAQ:TSLA)Q3 2019 Earnings Call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2113 | 0 | [Tesla, Inc. , NASDAQ:TSLA)Q3 2019 Earnings Ca... |
| 15384 | https://www.fool.com/earnings/call-transcripts... | NVIDIA Corp (NASDAQ:NVDA)Q2 2020 Earnings Call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 0 | [NVIDIA Corp , NASDAQ:NVDA)Q2 2020 Earnings Ca... |
| 15557 | https://www.fool.com/earnings/call-transcripts... | Apple (NASDAQ:AAPL)Q4 2020 Earnings CallOct 29... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2115 | 0 | [Apple , NASDAQ:AAPL)Q4 2020 Earnings CallOct ... |
| 15787 | https://www.fool.com/earnings/call-transcripts... | Microsoft Corp (NASDAQ:MSFT)Q2 2020 Earnings C... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 0 | [Microsoft Corp , NASDAQ:MSFT)Q2 2020 Earnings... |
2117 rows × 6 columns
df['co_name'] = np.where(df['co_count'] == 0, df['co_name'].apply(lambda x: x[0] if type(x) == list else x), np.nan)
df['co_name'] = df['co_name'].ffill()
df
| url | transcript | file | co_id | co_count | co_name | |
|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp |
| ... | ... | ... | ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp |
| 15989 | https://www.fool.com/earnings/call-transcripts... | See the 10 stocks | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 199 | Microsoft Corp |
513722 rows × 6 columns
I now have the company name. Get ticker from URL as well.
df['ticker_full'] = ''
df['ticker_full'] = np.where(df['co_count'] == 0, df['transcript'].str.findall('\((.*)\)'), np.nan)
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | |
|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | [NYSE:AMT] |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NaN |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NaN |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NaN |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NaN |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NaN |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NaN |
| 15989 | https://www.fool.com/earnings/call-transcripts... | See the 10 stocks | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NaN |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 199 | Microsoft Corp | NaN |
513722 rows × 7 columns
df['ticker_full'] = np.where(df['co_count'] == 0, df['ticker_full'].apply(lambda x: x[0] if type(x) == list and len(x) > 0 else x), np.nan)
df[df.ticker_full.str.len() == 0]
| url | transcript | file | co_id | co_count | co_name | ticker_full | |
|---|---|---|---|---|---|---|---|
| 7917 | https://www.fool.com/earnings/call-transcripts... | NVIDIA Corporation (NASDAQ: NVDAQ3 2019 Earnin... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2079 | 0 | NVIDIA Corporation | [] |
df['ticker_full'] = np.where(df.ticker_full.str.len() == 0, 'NASDAQ:NVDA', df['ticker_full']) # fix NVDA
df[df.ticker_full.str.len() == 0]
| url | transcript | file | co_id | co_count | co_name | ticker_full |
|---|
df.groupby('ticker_full').count()
| url | transcript | file | co_id | co_count | co_name | |
|---|---|---|---|---|---|---|
| ticker_full | ||||||
| ?????? : MSI | 1 | 1 | 1 | 1 | 1 | 1 |
| ?????? : SO | 1 | 1 | 1 | 1 | 1 | 1 |
| NASDAQ:AAPL | 10 | 10 | 10 | 10 | 10 | 10 |
| NASDAQ:ADBE | 7 | 7 | 7 | 7 | 7 | 7 |
| NASDAQ:ADI | 10 | 10 | 10 | 10 | 10 | 10 |
| ... | ... | ... | ... | ... | ... | ... |
| NYSE:WMT | 4 | 4 | 4 | 4 | 4 | 4 |
| NYSE:XOM | 10 | 10 | 10 | 10 | 10 | 10 |
| NYSE:ZTS | 20 | 20 | 20 | 20 | 20 | 20 |
| New) (NYSE:APH | 1 | 1 | 1 | 1 | 1 | 1 |
| New) (NYSE:APH | 5 | 5 | 5 | 5 | 5 | 5 |
209 rows × 6 columns
df['ticker_full'] = df['ticker_full'].ffill() # forward fill the tickers
df['ticker_full'] = np.where(df['ticker_full'].str.contains('GOOG'), 'NASDAQ:GOOGL', df['ticker_full'])
df['ticker'] = df['ticker_full'].apply(lambda x: x.split(':')[1]) # fix the ticker from full
Tickers are in with some of the odd cases fixed. Now add the date of the calls:
df['date'] = df['url'].str.split('call-transcripts/').map(lambda r: r[1]).str.split('/')\
.map(lambda r: r[:3]).map(lambda r: '-'.join(r))
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | |
|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15989 | https://www.fool.com/earnings/call-transcripts... | See the 10 stocks | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 199 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
513722 rows × 9 columns
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | |
|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15989 | https://www.fool.com/earnings/call-transcripts... | See the 10 stocks | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 199 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 |
513722 rows × 9 columns
df['quarter'] = ''
def q(x):
try:
return df.where(df['co_count'] == 0, df['transcript'].str.split(')')[1].str.split('Call')[0].str.strip())
except:
#try:
return df.where(df['co_count'] == 0, df['transcript'].str.split(df['ticker'])[1].str.split('Call')[0].str.strip().str.replace(' Earnings Conference', ''))
#except:
# return np.nan
df.quarter.drop_duplicates()
4 Name: quarter, dtype: object
Better way to get quarter from URL
df['quarter'] = df['url'].str.extract(r'([q]\d-\d{4})')#.str.split('-')\
df['quarter'] = df['quarter'].ffill()
Extra ad cleaning:
ad_list = ['10 stocks we like better than', 'They just revealed what they believe are the ten best stocks', 'See the 10 stocks',
'\*Stock Advisor returns as of']
df = df[~df.transcript.isin(ad_list)]
df = df[df['transcript'] != '']
for i in range(0, len(df)):
if i == 0:
df['co_count'].iloc[i] = 0
elif df['co_id'].iloc[i] == df['co_id'].iloc[i-1]:
df['co_count'].iloc[i] = df['co_count'].iloc[i-1] + 1
else:
df['co_count'].iloc[i] = 0
I had a speaker map in which I looked through the auto-generated speaker list to make sure nothing looked too off. I cleaned in Excel on the side, so now I am updating. If the transcript line is entirely a speaker name, then I know the transcript changed speakers, so I will changed the name in the speaker_full column.
df['speaker_full'] = df['transcript'].where(df['transcript'].isin(speaker_names), np.nan)
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15982 | https://www.fool.com/earnings/call-transcripts... | Brad Reback -- Stifel, Nicolaus & Company ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 194 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Brad Reback -- Stifel, Nicolaus & Company ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | NaN |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | NaN |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | NaN |
511511 rows × 11 columns
df['speaker_full'] = df['speaker_full'].ffill() # fill NaNs in the speaker column
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | NaN |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15982 | https://www.fool.com/earnings/call-transcripts... | Brad Reback -- Stifel, Nicolaus & Company ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 194 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Brad Reback -- Stifel, Nicolaus & Company ... |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... |
511511 rows × 11 columns
Clean up speaker full so there is just a speaker column as well:
df['speaker'] = df['speaker_full']
df['speaker'] = np.where(df['co_count'] == 0, 'Call Title', df['speaker'])
df['speaker'] = np.where(df['transcript'] == 'Operator', 'Operator', df['speaker'])
df['speaker'] = df['speaker'].ffill()
df['speaker'] = df['speaker'].fillna('')
df['speaker_full'] = df['speaker']
df['speaker'] = df['speaker'].str.split('--').tolist()
df['speaker'] = df['speaker'].apply(lambda x: x[0].strip())
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Call Title | Call Title |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15982 | https://www.fool.com/earnings/call-transcripts... | Brad Reback -- Stifel, Nicolaus & Company ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 194 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Brad Reback -- Stifel, Nicolaus & Company ... | Brad Reback |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
511511 rows × 12 columns
df.groupby('speaker').count()
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| speaker | |||||||||||
| (Multiple speakers) Fund management | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| A J Rice | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| A brand new one. We bought 150 of them. In the last 12 years | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| A. J. Rice | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 |
| A. William Stein | 95 | 95 | 95 | 95 | 95 | 95 | 95 | 95 | 95 | 95 | 95 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Zack Moxcey | 222 | 222 | 222 | 222 | 222 | 222 | 222 | 222 | 222 | 222 | 222 |
| Zack Sopcak | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 |
| Zane Chrane | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 |
| carol lee | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 |
| of John Davis | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 | 25 |
4157 rows × 11 columns
df[df.quarter == 'check'] # originally had some issues pulling the quarter from the transcript before I figured out the pattern
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker |
|---|
df.quarter = np.where(df.quarter == 'check', 'Q3 2019', df.quarter) # this is no longer necessary
goog = df[df['quarter'].str.contains('GOOGL')].groupby('url').count().index.tolist()
qs = ['Q'+i.split('-q')[1].split('-earnings')[0].replace('-', ' ') for i in goog]
# I had some issues with some of the Google ticker formatting that I needed to fix.
goog_map = {}
k = 0
for i in goog:
goog_map.update({i:qs[k]})
k += 1
goog_map
{}
df['quarter'] = np.where(df['url'].isin(goog), df['url'].map(goog_map), df['quarter'])
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Call Title | Call Title |
| 5 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator |
| 6 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator |
| 7 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky |
| 8 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15982 | https://www.fool.com/earnings/call-transcripts... | Brad Reback -- Stifel, Nicolaus & Company ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 194 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Brad Reback -- Stifel, Nicolaus & Company ... | Brad Reback |
| 15983 | https://www.fool.com/earnings/call-transcripts... | Philip Winslow -- Wells Fargo Securities -- An... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 195 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15984 | https://www.fool.com/earnings/call-transcripts... | More MSFT analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 196 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15988 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 197 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
| 15998 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2116 | 198 | Microsoft Corp | NASDAQ:MSFT | MSFT | 2020-01-30 | q2-2020 | Philip Winslow -- Wells Fargo Securities -- An... | Philip Winslow |
511511 rows × 12 columns
df[df['quarter'].str.contains('GOOG')]
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker |
|---|
df['quarter'] = np.where(df['quarter'].str.contains('GOOG'), 'Q4 2018', df['quarter'])
df = pd.merge(df, sp_map, on='ticker')
df = df.drop(columns = ['CIK', 'SEC filings'], axis = 1).rename(columns = {'Security':'co_clean', 'Headquarters Location':'hq', 'Date first added':'date_added', 'Founded':'founded'})
df
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker | co_clean | sector | sub_sector | hq | date_added | founded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Call Title | Call Title | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 1 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Operator | Operator | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 3 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | q1-2019 | Igor Khislavsky -- Vice President, Investor Re... | Igor Khislavsky | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 508234 | https://www.fool.com/earnings/call-transcripts... | Aaron Rakers -- Wells Fargo -- Analyst | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 150 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | q2-2020 | Aaron Rakers -- Wells Fargo -- Analyst | Aaron Rakers | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 508235 | https://www.fool.com/earnings/call-transcripts... | Stacy Rasgon -- Bernstein Research -- Analyst | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 151 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | q2-2020 | Stacy Rasgon -- Bernstein Research -- Analyst | Stacy Rasgon | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 508236 | https://www.fool.com/earnings/call-transcripts... | More NVDA analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 152 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | q2-2020 | Stacy Rasgon -- Bernstein Research -- Analyst | Stacy Rasgon | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 508237 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 153 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | q2-2020 | Stacy Rasgon -- Bernstein Research -- Analyst | Stacy Rasgon | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 508238 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 154 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | q2-2020 | Stacy Rasgon -- Bernstein Research -- Analyst | Stacy Rasgon | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
508239 rows × 18 columns
df['url'].str.split('call-transcripts/').map(lambda r: r[1]).str.split('/')\
.map(lambda r: r[:3]).map(lambda r: '-'.join(r)) # gets date from URL pattern
0 2019-05-03
1 2019-05-03
2 2019-05-03
3 2019-05-03
4 2019-05-03
...
508234 2019-08-16
508235 2019-08-16
508236 2019-08-16
508237 2019-08-16
508238 2019-08-16
Name: url, Length: 508239, dtype: object
df.to_csv('../proj_data/output_data/data_clean.csv') # main output file
df.drop_duplicates('url').groupby('sector').count()
| url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | quarter | speaker_full | speaker | co_clean | sub_sector | hq | date_added | founded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sector | |||||||||||||||||
| Communication Services | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 | 123 |
| Consumer Discretionary | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 170 | 160 | 170 |
| Consumer Staples | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 | 125 |
| Energy | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 | 49 |
| Financials | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 234 | 199 | 234 |
| Health Care | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 358 | 338 | 358 |
| Industrials | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 219 | 189 | 219 |
| Information Technology | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 345 | 296 | 345 |
| Materials | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 78 | 67 | 78 |
| Real Estate | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 | 80 |
| Utilities | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 79 | 59 | 79 |
df.drop_duplicates('co_clean').count() # some date addeds are missing in my mapping file from Wikipedia
url 196 transcript 196 file 196 co_id 196 co_count 196 co_name 196 ticker_full 196 ticker 196 date 196 quarter 196 speaker_full 196 speaker 196 co_clean 196 sector 196 sub_sector 196 hq 196 date_added 178 founded 196 dtype: int64
The raw data are transformed and cleansed. They were sent to a CSV. There are 196 companies in the data with about 2000 conference calls.
df.drop_duplicates(['ticker', 'quarter']).groupby('ticker').agg({'ticker': 'count', 'date':['min', 'max']})
| ticker | date | ||
|---|---|---|---|
| count | min | max | |
| ticker | |||
| A | 10 | 2019-02-21 | 2021-05-25 |
| AAPL | 10 | 2019-01-29 | 2021-04-29 |
| ABBV | 10 | 2019-01-25 | 2021-04-30 |
| ABT | 10 | 2018-07-18 | 2021-04-20 |
| ACN | 10 | 2019-03-28 | 2021-06-24 |
| ... | ... | ... | ... |
| WM | 10 | 2019-02-14 | 2021-04-27 |
| WMT | 4 | 2018-11-15 | 2021-05-18 |
| XEL | 10 | 2019-01-31 | 2021-04-29 |
| XOM | 10 | 2019-02-01 | 2021-04-30 |
| ZTS | 10 | 2019-02-14 | 2021-05-06 |
196 rows × 3 columns
df.drop_duplicates(['ticker', 'quarter']).groupby('ticker').agg({'ticker': 'count'}).median()
ticker 10.0 dtype: float64
Purpose: This section connects to Yahoo Finance and pulls the share-price information and market caps for each of the companies in the clean dataset. I also add a Q&A section.
import yfinance as yf
import pandas as pd
The code below is a test snippet on Amazon, which is later built out into the full loop on the entire dataset.
Amazon = yf.Ticker("AMZN")
print(Amazon.history(start='2021-1-1')['Close'][0])
Amazon.info['marketCap']
3186.6298828125
1673027780608
Amazon = yf.Ticker("AMZN")
print(Amazon.history(start='2021-1-1')['Close'])
Date
2021-01-04 3186.629883
2021-01-05 3218.510010
2021-01-06 3138.379883
2021-01-07 3162.159912
2021-01-08 3182.699951
...
2021-08-06 3344.939941
2021-08-09 3341.870117
2021-08-10 3320.679932
2021-08-11 3292.110107
2021-08-12 3303.500000
Name: Close, Length: 154, dtype: float64
df = pd.read_csv('../proj_data/output_data/data_clean.csv') # from prior section
Get a DF containing just the tickers and dates within the file:
co_dates = df[['ticker', 'date']].drop_duplicates()
co_dates
| ticker | date | |
|---|---|---|
| 0 | AMT | 2019-05-03 |
| 215 | AMT | 2020-02-25 |
| 439 | AMT | 2019-02-27 |
| 611 | AMT | 2019-10-31 |
| 839 | AMT | 2021-02-25 |
| ... | ... | ... |
| 507340 | NVDA | 2019-05-20 |
| 507518 | NVDA | 2021-05-27 |
| 507666 | NVDA | 2020-08-20 |
| 507858 | NVDA | 2020-05-22 |
| 508084 | NVDA | 2019-08-16 |
1854 rows × 2 columns
Get min and max call dates for each company:
min_max = co_dates.groupby('ticker').agg({'date': ['min', 'max']}).reset_index()
min_co = co_dates.groupby('ticker').min('date').reset_index()
min_co.columns
Index(['ticker', 'date'], dtype='object')
#min_co
Test on Amazon's min date. The idea is to get the minimum date's stock price through to the day the data were pulled in a DF. Later on, this DF can be used as a reference table to provide price information needed for modeling.
min_date = min_co[min_co['ticker']=='AMZN']['date'].iloc[0]
print(yf.Ticker("AMZN").history(start=min_date)['Close'])
Date
2019-01-31 1718.729980
2019-02-01 1626.229980
2019-02-04 1633.310059
2019-02-05 1658.810059
2019-02-06 1640.260010
...
2021-08-06 3344.939941
2021-08-09 3341.870117
2021-08-10 3320.679932
2021-08-11 3292.110107
2021-08-12 3303.500000
Name: Close, Length: 639, dtype: float64
Loop through grouped DF to find the prices for each ticker. Yahoo Finance also contains market caps, but I couldn't look by date, so they are all as of when this file was last run (August 12 in the case of my data). This isn't ideal, but it's okay for my purposes and is something I will look into further. I add these market caps to a dictionary to be used later.
mc_dict = {}
df_prices = pd.DataFrame()
for ticker in list(min_co['ticker']):
min_date = min_co[min_co['ticker']==ticker]['date'].iloc[0]
temp = pd.DataFrame(yf.Ticker(ticker).history(start=min_date)['Close']).reset_index()
temp['ticker'] = ticker
df_prices = df_prices.append(temp)
mc_dict[ticker] = yf.Ticker(ticker).info['marketCap']
print(ticker)
A AAPL ABBV ABT ACN ADBE ADI ADP ADSK AEP AIG ALGN ALL ALXN AMAT AMD AMGN AMT AMZN ANTM AON APD APH APTV ATVI AVGO AXP BA BAC BAX BDX BIIB BK BKNG BLK BMY BSX C CARR CAT CB CCI CDNS CHTR CI CL CMCSA CME CMG CNC COF COP COST CRM CSCO CSX CVS CVX D DD DE DG DHR DIS DLR DOW DUK DXCM EA EBAY ECL EL EMR EOG EQIX EW EXC F FB FCX FDX FIS FISV GD GE GILD GIS GM GOOGL GPN GS HCA HD HON HUM IBM ICE IDXX ILMN INFO INTC INTU IQV ISRG ITW JCI JNJ JPM KLAC KMB KO LHX LIN LLY LMT LOW LRCX MA MCD MDLZ MDT MET MMC MMM MO MRK MS MSCI MSFT MSI MU NEE NEM NFLX NKE NOC NOW NSC NVDA NXPI ORCL ORLY PEP PFE PG PGR PH PLD PM PNC PPG PRU PSA PYPL QCOM REGN RMD ROP ROST RTX SBAC SBUX SHW SLB SNPS SO SPG SPGI SRE STZ SYK T TEL TFC TGT TJX TMO TMUS TRV TSLA TT TWTR TXN UNH UNP UPS USB V VRTX VZ WFC WM WMT XEL XOM ZTS
#temp
Show dictionary containing market caps for each company:
list(mc_dict.items())[:10] # show top lines to make sure the dictionary worked
[('A', 47989510144),
('AAPL', 2461181411328),
('ABBV', 204356698112),
('ABT', 217720012800),
('ACN', 204039929856),
('ADBE', 302204321792),
('ADI', 62176858112),
('ADP', 90920099840),
('ADSK', 73055313920),
('AEP', 44497326080)]
Show price table:
df_prices
| Date | Close | ticker | |
|---|---|---|---|
| 0 | 2019-02-21 | 76.392479 | A |
| 1 | 2019-02-22 | 76.912292 | A |
| 2 | 2019-02-25 | 77.814606 | A |
| 3 | 2019-02-26 | 77.039795 | A |
| 4 | 2019-02-27 | 77.893066 | A |
| ... | ... | ... | ... |
| 623 | 2021-08-06 | 201.880005 | ZTS |
| 624 | 2021-08-09 | 199.720001 | ZTS |
| 625 | 2021-08-10 | 198.759995 | ZTS |
| 626 | 2021-08-11 | 197.410004 | ZTS |
| 627 | 2021-08-12 | 199.839996 | ZTS |
126529 rows × 3 columns
Merge the two (i.e. map the dictionary to the price table)
df_prices['market_cap'] = df_prices['ticker'].map(mc_dict)
df_prices
| Date | Close | ticker | market_cap | |
|---|---|---|---|---|
| 0 | 2019-02-21 | 76.392479 | A | 47989510144 |
| 1 | 2019-02-22 | 76.912292 | A | 47989510144 |
| 2 | 2019-02-25 | 77.814606 | A | 47989510144 |
| 3 | 2019-02-26 | 77.039795 | A | 47989510144 |
| 4 | 2019-02-27 | 77.893066 | A | 47989510144 |
| ... | ... | ... | ... | ... |
| 623 | 2021-08-06 | 201.880005 | ZTS | 94877433856 |
| 624 | 2021-08-09 | 199.720001 | ZTS | 94877433856 |
| 625 | 2021-08-10 | 198.759995 | ZTS | 94877433856 |
| 626 | 2021-08-11 | 197.410004 | ZTS | 94877433856 |
| 627 | 2021-08-12 | 199.839996 | ZTS | 94877433856 |
126529 rows × 4 columns
df_prices.to_csv('../proj_data/output_data/comps_prices.csv') # send to CSV
df['date']=df.date.astype('datetime64[ns]')
df_prices['Date'] = df_prices.Date.astype('datetime64[ns]')
df_prices.head(5)
| Date | Close | ticker | market_cap | |
|---|---|---|---|---|
| 0 | 2019-02-21 | 76.392479 | A | 47989510144 |
| 1 | 2019-02-22 | 76.912292 | A | 47989510144 |
| 2 | 2019-02-25 | 77.814606 | A | 47989510144 |
| 3 | 2019-02-26 | 77.039795 | A | 47989510144 |
| 4 | 2019-02-27 | 77.893066 | A | 47989510144 |
new_df = df
new_df['close_0'] = pd.merge(df_prices, df, how='right', left_on=['Date','ticker'], right_on = ['date','ticker'])['Close']
new_df['market_cap'] = pd.merge(df_prices, df, how='right', left_on=['Date','ticker'], right_on = ['date','ticker'])['market_cap']
new_df['close_0'].unique()
array([185.05751038, 239.17320251, 165.35540771, ..., 121.26818085,
90.11592865, 39.73274231])
new_df.columns
Index(['Unnamed: 0', 'url', 'transcript', 'file', 'co_id', 'co_count',
'co_name', 'ticker_full', 'ticker', 'date', 'quarter', 'speaker_full',
'speaker', 'co_clean', 'sector', 'sub_sector', 'hq', 'date_added',
'founded', 'close_0', 'market_cap'],
dtype='object')
Now the closing prices per company as of each call date are in the data.
We now want to create a set of columns representing weekly days out from the conference call date up through 3 months out (i.e. roughly the next earnings call)
from datetime import timedelta
for i in range(1, 91): # create date columns
new_df['date_'+str(i)] = new_df.date + timedelta(days=i) # create a new column that's the date in the date column plus 1-90 days
new_df.head(5)
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | date_81 | date_82 | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
| 1 | 1 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
| 3 | 3 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
5 rows × 111 columns
Now we can lookup the prices for each of these dates from the pricing table
The following loop creates a column called close_n which will contain the close price n days from the call date (this date is found in the columns created above). It uses the DF created in the Yahoo Finance step as a lookup table.
for i in range(1, 91): # merge each of the dates in these 90 date columns with the price data from Yahoo
s = i
new_df['close_'+str(s)] = pd.merge(df_prices, new_df, how='right', left_on=['Date','ticker'], right_on = ['date_'+str(s),'ticker'])['Close']
new_df.head()
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_81 | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 199.196991 | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 |
| 1 | 1 | https://www.fool.com/earnings/call-transcripts... | Operator | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 1 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 199.196991 | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 199.196991 | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 |
| 3 | 3 | https://www.fool.com/earnings/call-transcripts... | Igor Khislavsky -- Vice President, Investor Re... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 3 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 199.196991 | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 199.196991 | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 |
5 rows × 201 columns
new_df.ticker.unique()
array(['AMT', 'RTX', 'IBM', 'AMAT', 'TGT', 'BLK', 'GS', 'CAT', 'ISRG',
'AMD', 'CVS', 'GE', 'AXP', 'NOW', 'MMM', 'DE', 'AEP', 'INFO', 'GD',
'ALGN', 'MET', 'DXCM', 'SNPS', 'IDXX', 'F', 'HCA', 'GPN', 'TWTR',
'NXPI', 'D', 'REGN', 'HUM', 'PGR', 'EMR', 'EXC', 'TEL', 'CMG',
'EBAY', 'LHX', 'DLR', 'DOW', 'XEL', 'MSI', 'PH', 'GIS', 'RMD',
'SBAC', 'SLB', 'EA', 'ROST', 'CNC', 'ORLY', 'EOG', 'SRE', 'BAX',
'STZ', 'PRU', 'CDNS', 'ALL', 'TRV', 'BK', 'SPG', 'A', 'PSA', 'IQV',
'TT', 'MSCI', 'KLAC', 'FCX', 'AON', 'NEM', 'ROP', 'JCI', 'BIIB',
'KMB', 'WM', 'ECL', 'VRTX', 'NOC', 'DG', 'SPGI', 'ZTS', 'LMT',
'PLD', 'ANTM', 'CHTR', 'LRCX', 'FIS', 'BKNG', 'ADP', 'SYK', 'TMUS',
'GILD', 'TJX', 'CCI', 'PNC', 'COP', 'MDLZ', 'MU', 'MO', 'ILMN',
'ICE', 'SHW', 'ITW', 'NSC', 'EW', 'CL', 'SO', 'FISV', 'ADSK',
'BSX', 'APD', 'ADI', 'CSX', 'MMC', 'ATVI', 'BDX', 'FDX', 'EQIX',
'CB', 'DUK', 'TFC', 'CME', 'COF', 'USB', 'EL', 'GM', 'CI', 'ALXN',
'PPG', 'CARR', 'AIG', 'APH', 'DD', 'APTV', 'MRK', 'ACN', 'TMO',
'CVX', 'LLY', 'T', 'NKE', 'AVGO', 'WMT', 'TXN', 'COST', 'DHR',
'MCD', 'MDT', 'QCOM', 'WFC', 'C', 'INTU', 'MS', 'BA', 'LOW',
'AMGN', 'SBUX', 'HON', 'ORCL', 'LIN', 'NEE', 'BMY', 'PM', 'UNP',
'UPS', 'UNH', 'PG', 'CMCSA', 'BAC', 'JPM', 'HD', 'JNJ', 'V', 'MA',
'DIS', 'PYPL', 'ADBE', 'CRM', 'PEP', 'XOM', 'KO', 'CSCO', 'ABBV',
'VZ', 'INTC', 'PFE', 'NFLX', 'ABT', 'FB', 'MSFT', 'AAPL', 'GOOGL',
'AMZN', 'TSLA', 'NVDA'], dtype=object)
new_df.speaker.unique()
array(['Call Title', 'Operator', 'Igor Khislavsky', ...,
'Simona Jankowski', 'Colette Kress', 'Jensen Huang'], dtype=object)
new_df = new_df[new_df.transcript != new_df.speaker_full] # remove lines where the transcript is exactly a speaker
import numpy as np
new_df['qa'] = np.where(new_df['co_count'] == 0, 'pres', None)
new_df.head()
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 5 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 6 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
5 rows × 202 columns
There is a pattern on the speaker_full column to identify an analyst from a bank, research firm, etc. vs a company employee. If it's too early in the call, then it shouldn't be an analyst. I don't want to start the QA too soon.
new_df['qa'] = np.where((new_df['co_count'] > 5) & (new_df['speaker_full'].str.contains('--\D*--')), 'qa', new_df['qa'])
new_df.head()
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 5 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
| 6 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | None |
5 rows × 202 columns
new_df[new_df['qa']=='qa'] # check some of the QA data
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53 | 53 | https://www.fool.com/earnings/call-transcripts... | Great, thanks for taking my questions. I had t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 53 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 54 | 54 | https://www.fool.com/earnings/call-transcripts... | Secondly, one of the things that we get questi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 54 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 60 | 60 | https://www.fool.com/earnings/call-transcripts... | And I guess, if I could just follow up quickly... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 60 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 66 | 66 | https://www.fool.com/earnings/call-transcripts... | Great, thank you. As it relates to your innova... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 66 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 67 | 67 | https://www.fool.com/earnings/call-transcripts... | And then secondly, as it relates to the US org... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 67 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 508202 | 508202 | https://www.fool.com/earnings/call-transcripts... | Yeah, thanks for taking the questions. And con... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 118 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508212 | 508212 | https://www.fool.com/earnings/call-transcripts... | Hi guys, thanks for taking my questions. I hav... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 128 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508236 | 508236 | https://www.fool.com/earnings/call-transcripts... | More NVDA analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 152 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508237 | 508237 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 153 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508238 | 508238 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 154 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
74569 rows × 202 columns
new_df['qa'] = new_df['qa'].ffill() # forward fill the NAs and Nones in the QA column
new_df.head()
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 5 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 6 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
5 rows × 202 columns
new_df['qa'].unique()
array(['pres', 'qa'], dtype=object)
Check some of the data from both splits of Q&A to make sure they pass the eye test.
new_df[new_df['qa']=='qa']
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 53 | 53 | https://www.fool.com/earnings/call-transcripts... | Great, thanks for taking my questions. I had t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 53 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 54 | 54 | https://www.fool.com/earnings/call-transcripts... | Secondly, one of the things that we get questi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 54 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 56 | 56 | https://www.fool.com/earnings/call-transcripts... | So Brandon, good morning. The percent of US bu... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 56 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 58 | 58 | https://www.fool.com/earnings/call-transcripts... | And just to add on, just on the color on the g... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 58 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| 60 | 60 | https://www.fool.com/earnings/call-transcripts... | And I guess, if I could just follow up quickly... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 60 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | qa |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 508222 | 508222 | https://www.fool.com/earnings/call-transcripts... | [Operator Closing Remarks] | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 138 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508223 | 508223 | https://www.fool.com/earnings/call-transcripts... | Duration: 58 minutes | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 139 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508236 | 508236 | https://www.fool.com/earnings/call-transcripts... | More NVDA analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 152 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508237 | 508237 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 153 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 508238 | 508238 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 154 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
216043 rows × 202 columns
new_df[new_df['qa']=='pres']
| Unnamed: 0 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 4 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 5 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 6 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 508109 | 508109 | https://www.fool.com/earnings/call-transcripts... | Moving to the rest of the P&L. Q2, GAAP gr... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 25 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | pres |
| 508110 | 508110 | https://www.fool.com/earnings/call-transcripts... | With that let, me turn to the outlook for the ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 26 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | pres |
| 508111 | 508111 | https://www.fool.com/earnings/call-transcripts... | Further financial details are included in the ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 27 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | pres |
| 508113 | 508113 | https://www.fool.com/earnings/call-transcripts... | Operator, can you poll for questions, please. | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 29 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | pres |
| 508115 | 508115 | https://www.fool.com/earnings/call-transcripts... | [Operator Instructions] And your first questio... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 31 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | pres |
96938 rows × 202 columns
Add market cap to the data frame:
new_df['market_cap'] = new_df['ticker'].map(mc_dict)
Write new file to a CSV:
new_df.to_csv('../proj_data/output_data/data_clean2.csv')
close_list = [] # will contain close_n for 1-90
for i in range(1, 91):
s = i
close_list.append('close_'+str(s))
Make a smaller DF with price information:
date_list = [] # similar idea as above
for i in range(1, 91):
s = i
date_list.append('date_'+str(s))
# index with the new lists and send to CSV
new_df[['ticker', 'qa', 'quarter', 'co_clean', 'date', 'close_0', 'market_cap'] + close_list +date_list].drop_duplicates().to_csv('../proj_data/output_data/price_map.csv')
#del new_df # was running into memory issues, so used this to cleanup data
Purpose: This file creates the OHCO files and digital corpus (F1, F2, etc.)
import pandas as pd
import os
import nltk
from tqdm import tqdm
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
[nltk_data] Downloading package punkt to /Users/dfuent/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /Users/dfuent/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/dfuent/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package tagsets to /Users/dfuent/nltk_data... [nltk_data] Package tagsets is already up-to-date!
True
OHCO = ['ticker', 'speaker', 'quarter', 'qa'] # my OHCO indices
df = pd.read_csv('../proj_data/output_data/data_clean2.csv').drop('Unnamed: 0', axis = 1)
#df
df['quarter'] = df['quarter'].str.replace('Earnings', '')\
.str.replace('Conference', '').str.replace('Fiscal Year', '').str.strip()
df
| Unnamed: 0.1 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 1 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 3 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 4 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 312976 | 508222 | https://www.fool.com/earnings/call-transcripts... | [Operator Closing Remarks] | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 138 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312977 | 508223 | https://www.fool.com/earnings/call-transcripts... | Duration: 58 minutes | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 139 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312978 | 508236 | https://www.fool.com/earnings/call-transcripts... | More NVDA analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 152 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312979 | 508237 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 153 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312980 | 508238 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 154 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
312981 rows × 202 columns
Data are in a DF. Some extra cleaning (I don't think this is necessary any longer; pretty sure the data are clean and this is an artifact of something I worked on a few weeks ago.).
df.quarter.drop_duplicates()
0 q1-2019 129 q4-2019 298 q4-2018 416 q3-2019 564 q4-2020 780 q2-2020 971 q2-2019 1082 q3-2020 1284 q1-2021 1460 q1-2020 1729 q2-2018 2842 q3-2018 3015 q4-2017 4993 q2-2021 24923 q1-2018 67113 q3-2021 68732 q4-2021 100695 q1-2022 307792 q3-2017 Name: quarter, dtype: object
df.quarter.drop_duplicates()
df['co_name'] = df['co_name'].str.strip()
LIB = df[['url', 'file', 'co_clean', 'quarter', 'ticker', 'sector', 'sub_sector', 'hq', 'date_added','founded']]
LIB.drop_duplicates(inplace = True)
LIB.to_csv('../proj_data/output_data/LIB.csv') # send to CSV
LIB
| url | file | co_clean | quarter | ticker | sector | sub_sector | hq | date_added | founded | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | American Tower | q1-2019 | AMT | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 129 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | American Tower | q4-2019 | AMT | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 298 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | American Tower | q4-2018 | AMT | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 416 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | American Tower | q3-2019 | AMT | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| 564 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | American Tower | q4-2020 | AMT | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 312386 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | Nvidia | q1-2020 | NVDA | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 312496 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | Nvidia | q1-2022 | NVDA | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 312603 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | Nvidia | q2-2021 | NVDA | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 312746 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | Nvidia | q1-2021 | NVDA | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
| 312880 | https://www.fool.com/earnings/call-transcripts... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | Nvidia | q2-2020 | NVDA | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
2125 rows × 10 columns
Group data by Call to produce transcripts by quarter, ticker, and Q&A split.
df
| Unnamed: 0.1 | url | transcript | file | co_id | co_count | co_name | ticker_full | ticker | date | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | qa | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | https://www.fool.com/earnings/call-transcripts... | American Tower Corp (NYSE:AMT)Q1 2019 Earning... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 0 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 1 | 2 | https://www.fool.com/earnings/call-transcripts... | Ladies and gentlemen, thank you for standing b... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 2 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 2 | 4 | https://www.fool.com/earnings/call-transcripts... | Thanks, Kevin. Good morning and thank you for ... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 4 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 3 | 5 | https://www.fool.com/earnings/call-transcripts... | We've posted a presentation, which we'll refer... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 5 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| 4 | 6 | https://www.fool.com/earnings/call-transcripts... | Before I begin, I'll remind you that this call... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 0 | 6 | American Tower Corp | NYSE:AMT | AMT | 2019-05-03 | ... | 197.260849 | 197.867722 | 197.068253 | NaN | NaN | 198.012222 | 200.978943 | 203.839783 | 203.637527 | pres |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 312976 | 508222 | https://www.fool.com/earnings/call-transcripts... | [Operator Closing Remarks] | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 138 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312977 | 508223 | https://www.fool.com/earnings/call-transcripts... | Duration: 58 minutes | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 139 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312978 | 508236 | https://www.fool.com/earnings/call-transcripts... | More NVDA analysis | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 152 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312979 | 508237 | https://www.fool.com/earnings/call-transcripts... | They just revealed what they believe are the t... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 153 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
| 312980 | 508238 | https://www.fool.com/earnings/call-transcripts... | Market data powered by FactSet and Web Financi... | /Users/dfuent/Desktop/Desktop - David’s MacBoo... | 2114 | 154 | NVIDIA Corp | NASDAQ:NVDA | NVDA | 2019-08-16 | ... | 51.754051 | 51.931030 | 51.791439 | NaN | NaN | 51.891140 | 52.247593 | 51.988361 | 52.292454 | qa |
312981 rows × 202 columns
df = df[df.co_count != 0] # get first line for each call, which is the title
df_call = df.groupby(['ticker', 'quarter', 'qa']).transcript.apply(lambda x: ' '.join(x)).to_frame()
df_call
| transcript | |||
|---|---|---|---|
| ticker | quarter | qa | |
| A | q1-2019 | pres | Good day, ladies and gentlemen, and welcome to... |
| qa | Great. Thanks, guys. Maybe just to start -- [I... | ||
| q1-2020 | pres | Good afternoon and welcome to the Agilent Tech... | |
| qa | Hey, thanks. Appreciate you guys quantifying t... | ||
| q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | |
| ... | ... | ... | ... |
| ZTS | q4-2018 | qa | Thanks guys. Appreciate taking the call. Congr... |
| q4-2019 | pres | Good day and welcome to the Fourth Quarter and... | |
| qa | Hi, thanks for taking my questions and congrat... | ||
| q4-2020 | pres | Welcome to the Fourth Quarter and Full Year 20... | |
| qa | Thanks guys, good morning. Congrats on just a ... |
3679 rows × 1 columns
df_map = df[['qa', 'co_clean', 'ticker', 'date', 'quarter', 'close_0'] + close_list + date_list].drop_duplicates()
df_map.set_index(['ticker', 'qa', 'quarter'], inplace = True)
df_map
| co_clean | date | close_0 | close_1 | close_2 | close_3 | close_4 | close_5 | close_6 | close_7 | ... | date_81 | date_82 | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | qa | quarter | |||||||||||||||||||||
| AMT | pres | q1-2019 | American Tower | 2019-05-03 | 185.057510 | NaN | NaN | 184.319000 | 182.084335 | 183.801117 | 184.645126 | 187.656662 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 |
| qa | q1-2019 | American Tower | 2019-05-03 | 185.057510 | NaN | NaN | 184.319000 | 182.084335 | 183.801117 | 184.645126 | 187.656662 | ... | 2019-07-23 | 2019-07-24 | 2019-07-25 | 2019-07-26 | 2019-07-27 | 2019-07-28 | 2019-07-29 | 2019-07-30 | 2019-07-31 | 2019-08-01 | |
| pres | q4-2019 | American Tower | 2020-02-25 | 239.173203 | 235.801392 | 226.337082 | 220.380600 | NaN | NaN | 233.935760 | 235.781982 | ... | 2020-05-16 | 2020-05-17 | 2020-05-18 | 2020-05-19 | 2020-05-20 | 2020-05-21 | 2020-05-22 | 2020-05-23 | 2020-05-24 | 2020-05-25 | |
| qa | q4-2019 | American Tower | 2020-02-25 | 239.173203 | 235.801392 | 226.337082 | 220.380600 | NaN | NaN | 233.935760 | 235.781982 | ... | 2020-05-16 | 2020-05-17 | 2020-05-18 | 2020-05-19 | 2020-05-20 | 2020-05-21 | 2020-05-22 | 2020-05-23 | 2020-05-24 | 2020-05-25 | |
| pres | q4-2018 | American Tower | 2019-02-27 | 165.355408 | 168.171768 | 169.861633 | NaN | NaN | 171.551453 | 172.859406 | 173.422668 | ... | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 | 2019-05-23 | 2019-05-24 | 2019-05-25 | 2019-05-26 | 2019-05-27 | 2019-05-28 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| NVDA | qa | q2-2021 | Nvidia | 2020-08-20 | 121.268181 | 126.686844 | NaN | NaN | 127.053909 | 127.351067 | 127.580795 | 126.134995 | ... | 2020-11-09 | 2020-11-10 | 2020-11-11 | 2020-11-12 | 2020-11-13 | 2020-11-14 | 2020-11-15 | 2020-11-16 | 2020-11-17 | 2020-11-18 |
| pres | q1-2021 | Nvidia | 2020-05-22 | 90.115929 | NaN | NaN | NaN | 87.035942 | 85.114075 | 84.732201 | 88.610886 | ... | 2020-08-11 | 2020-08-12 | 2020-08-13 | 2020-08-14 | 2020-08-15 | 2020-08-16 | 2020-08-17 | 2020-08-18 | 2020-08-19 | 2020-08-20 | |
| qa | q1-2021 | Nvidia | 2020-05-22 | 90.115929 | NaN | NaN | NaN | 87.035942 | 85.114075 | 84.732201 | 88.610886 | ... | 2020-08-11 | 2020-08-12 | 2020-08-13 | 2020-08-14 | 2020-08-15 | 2020-08-16 | 2020-08-17 | 2020-08-18 | 2020-08-19 | 2020-08-20 | |
| pres | q2-2020 | Nvidia | 2019-08-16 | 39.732742 | NaN | NaN | 42.526691 | 41.802059 | 42.638748 | 42.700996 | 40.449909 | ... | 2019-11-05 | 2019-11-06 | 2019-11-07 | 2019-11-08 | 2019-11-09 | 2019-11-10 | 2019-11-11 | 2019-11-12 | 2019-11-13 | 2019-11-14 | |
| qa | q2-2020 | Nvidia | 2019-08-16 | 39.732742 | NaN | NaN | 42.526691 | 41.802059 | 42.638748 | 42.700996 | 40.449909 | ... | 2019-11-05 | 2019-11-06 | 2019-11-07 | 2019-11-08 | 2019-11-09 | 2019-11-10 | 2019-11-11 | 2019-11-12 | 2019-11-13 | 2019-11-14 |
3719 rows × 183 columns
Merge price info to call data:
df_call = df_call.merge(df_map, on = ['ticker', 'quarter', 'qa'], how = 'left')
df_call
| transcript | co_clean | date | close_0 | close_1 | close_2 | close_3 | close_4 | close_5 | close_6 | ... | date_81 | date_82 | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | quarter | qa | |||||||||||||||||||||
| A | q1-2019 | pres | Good day, ladies and gentlemen, and welcome to... | Agilent Technologies | 2019-02-21 | 76.392479 | 76.912292 | NaN | NaN | 77.814606 | 77.039795 | 77.893066 | ... | 2019-05-13 | 2019-05-14 | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 |
| qa | Great. Thanks, guys. Maybe just to start -- [I... | Agilent Technologies | 2019-02-21 | 76.392479 | 76.912292 | NaN | NaN | 77.814606 | 77.039795 | 77.893066 | ... | 2019-05-13 | 2019-05-14 | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 | ||
| q1-2020 | pres | Good afternoon and welcome to the Agilent Tech... | Agilent Technologies | 2020-02-18 | 83.869682 | 84.423607 | 83.424568 | 84.146637 | NaN | NaN | 79.626244 | ... | 2020-05-09 | 2020-05-10 | 2020-05-11 | 2020-05-12 | 2020-05-13 | 2020-05-14 | 2020-05-15 | 2020-05-16 | 2020-05-17 | 2020-05-18 | |
| qa | Hey, thanks. Appreciate you guys quantifying t... | Agilent Technologies | 2020-02-18 | 83.869682 | 84.423607 | 83.424568 | 84.146637 | NaN | NaN | 79.626244 | ... | 2020-05-09 | 2020-05-10 | 2020-05-11 | 2020-05-12 | 2020-05-13 | 2020-05-14 | 2020-05-15 | 2020-05-16 | 2020-05-17 | 2020-05-18 | ||
| q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | Agilent Technologies | 2021-02-16 | 127.587898 | 129.751755 | 127.318665 | 126.510956 | NaN | NaN | 123.150490 | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | q4-2018 | qa | Thanks guys. Appreciate taking the call. Congr... | Zoetis | 2019-02-14 | 91.927254 | 94.007095 | NaN | NaN | NaN | 93.553688 | 93.041100 | ... | 2019-05-06 | 2019-05-07 | 2019-05-08 | 2019-05-09 | 2019-05-10 | 2019-05-11 | 2019-05-12 | 2019-05-13 | 2019-05-14 | 2019-05-15 |
| q4-2019 | pres | Good day and welcome to the Fourth Quarter and... | Zoetis | 2020-02-13 | 143.703262 | 142.939804 | NaN | NaN | NaN | 142.424271 | 142.830765 | ... | 2020-05-04 | 2020-05-05 | 2020-05-06 | 2020-05-07 | 2020-05-08 | 2020-05-09 | 2020-05-10 | 2020-05-11 | 2020-05-12 | 2020-05-13 | |
| qa | Hi, thanks for taking my questions and congrat... | Zoetis | 2020-02-13 | 143.703262 | 142.939804 | NaN | NaN | NaN | 142.424271 | 142.830765 | ... | 2020-05-04 | 2020-05-05 | 2020-05-06 | 2020-05-07 | 2020-05-08 | 2020-05-09 | 2020-05-10 | 2020-05-11 | 2020-05-12 | 2020-05-13 | ||
| q4-2020 | pres | Welcome to the Fourth Quarter and Full Year 20... | Zoetis | 2021-02-16 | 165.862793 | 168.435669 | 167.488297 | 159.560150 | NaN | NaN | 159.719711 | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 | |
| qa | Thanks guys, good morning. Congrats on just a ... | Zoetis | 2021-02-16 | 165.862793 | 168.435669 | 167.488297 | 159.560150 | NaN | NaN | 159.719711 | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 |
3719 rows × 184 columns
df_call.to_csv('../proj_data/output_data/CALL.csv')
Group data by Speaker and Call to produce transcripts by speaker, quarter, ticker, and Q&A split. This should have each handoff between speaker in each call.
df_speaker = df.groupby(OHCO[:4]).transcript.apply(lambda x: ' '.join(x)).to_frame()
df_speaker
| transcript | ||||
|---|---|---|---|---|
| ticker | speaker | quarter | qa | |
| A | Andrew Obin | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... |
| Ankur Dhingra | q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | |
| qa | All right. Duration: 69 minutes | |||
| q1-2020 | pres | Thank you, Jillian. Welcome everyone to Agilen... | ||
| qa | All right. Thanks everyone. With that, we woul... | |||
| ... | ... | ... | ... | ... |
| ZTS | Steven Frank | q1-2021 | pres | Thank you, Keith. Good morning, everyone, and ... |
| q3-2019 | pres | Good morning everyone and welcome to the Zoeti... | ||
| q3-2020 | pres | Thank you, Keith. Good morning, everyone, and ... | ||
| Thomas Chiu | q2-2019 | qa | Hi. Thanks for taking my questions. This is Th... | |
| Vijay Jayant | q1-2021 | pres | Welcome to the First Quarter 2021 Financial Re... |
33151 rows × 1 columns
df_speaker.to_csv('../proj_data/output_data/speaker_call.csv')
Group data by company: just a big string of calls per ticker (probably not too important).
df_co = df.groupby('ticker').transcript.apply(lambda x: ' '.join(x)).to_frame()
df_co
| transcript | |
|---|---|
| ticker | |
| A | Good afternoon and welcome to the Agilent Tech... |
| AAPL | Good day and welcome to the Apple Incorporated... |
| ABBV | Good morning and thank you for standing by. We... |
| ABT | See all our earnings call transcripts. Good mo... |
| ACN | Ladies and gentlemen, thank you for standing b... |
| ... | ... |
| WM | Ladies and gentlemen, thank you for standing b... |
| WMT | Greetings. Welcome to Walmart's Fiscal 2022 Fi... |
| XEL | Good day, ladies and gentlemen, and welcome to... |
| XOM | Good day, everyone. Welcome to this ExxonMobil... |
| ZTS | Welcome to the Third Quarter 2019 Financial Re... |
196 rows × 1 columns
df_co.to_csv('../proj_data/output_data/COMPANY.csv')
Group data by Speaker and add in Q&A:
df_transition = df.groupby(OHCO[:3]+['co_count', 'qa']).transcript.apply(lambda x: ' '.join(x)).to_frame()
df_transition
| transcript | |||||
|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |
| A | Andrew Obin | q1-2021 | 2 | pres | Good afternoon and welcome to the Agilent Tech... |
| 3 | pres | They just revealed what they believe are the t... | |||
| Ankur Dhingra | q1-2019 | 4 | pres | Thank you. And welcome, everyone, to Agilent's... | |
| 5 | pres | You can find the press release, investor prese... | |||
| 6 | pres | They just revealed what they believe are the t... | |||
| ... | ... | ... | ... | ... | ... |
| ZTS | Thomas Chiu | q2-2019 | 157 | qa | Hi. Thanks for taking my questions. This is Th... |
| 158 | qa | Second question is, if you could comment on ot... | |||
| Vijay Jayant | q1-2021 | 2 | pres | Welcome to the First Quarter 2021 Financial Re... | |
| 3 | pres | It is now my pleasure to turn the floor over t... | |||
| 4 | pres | They just revealed what they believe are the t... |
276687 rows × 1 columns
df_transition.to_csv('../proj_data/output_data/transition_call.csv')
import time
t1 = time.time()
df_transition.index
MultiIndex([( 'A', 'Andrew Obin', 'q1-2021', 2, 'pres'),
( 'A', 'Andrew Obin', 'q1-2021', 3, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 4, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 5, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 6, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 7, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 8, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 46, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 48, 'pres'),
( 'A', 'Ankur Dhingra', 'q1-2019', 380, 'qa'),
...
('ZTS', 'Steven Frank', 'q3-2019', 5, 'pres'),
('ZTS', 'Steven Frank', 'q3-2019', 6, 'pres'),
('ZTS', 'Steven Frank', 'q3-2019', 7, 'pres'),
('ZTS', 'Steven Frank', 'q3-2020', 5, 'pres'),
('ZTS', 'Steven Frank', 'q3-2020', 6, 'pres'),
('ZTS', 'Thomas Chiu', 'q2-2019', 157, 'qa'),
('ZTS', 'Thomas Chiu', 'q2-2019', 158, 'qa'),
('ZTS', 'Vijay Jayant', 'q1-2021', 2, 'pres'),
('ZTS', 'Vijay Jayant', 'q1-2021', 3, 'pres'),
('ZTS', 'Vijay Jayant', 'q1-2021', 4, 'pres')],
names=['ticker', 'speaker', 'quarter', 'co_count', 'qa'], length=276687)
df_transition.index.get_level_values('co_count')
Int64Index([ 2, 3, 4, 5, 6, 7, 8, 46, 48, 380,
...
5, 6, 7, 5, 6, 157, 158, 2, 3, 4],
dtype='int64', name='co_count', length=276687)
Create tokens from each call:
Tokenize even lines (tokenizing all at once gave me memory issues, so I split to odds and evens):
df_tok = df_transition['transcript'][df_transition.index.get_level_values('co_count') % 2 == 0].str.split(r"[\s,-]+", expand=True).stack()\
.to_frame().rename(columns={0:'token_str'})
print(time.time()-t1)
218.3086760044098
df_tok
| token_str | ||||||
|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | ||
| A | Andrew Obin | q1-2021 | 2 | pres | 0 | Good |
| 1 | afternoon | |||||
| 2 | and | |||||
| 3 | welcome | |||||
| 4 | to | |||||
| ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | 4 | pres | 30 | stocks |
| 31 | are | |||||
| 32 | even | |||||
| 33 | better | |||||
| 34 | buys. |
10227625 rows × 1 columns
Tokenize odd lines:
#%%time
df_tok2 = df_transition['transcript'][df_transition.index.get_level_values('co_count') % 2 == 1].str.split(r"[\s,-]+", expand=True).stack()\
.to_frame().rename(columns={0:'token_str'})
print(time.time()-t1)
298.99941396713257
df_tokens = df_tok.append(df_tok2) # append the token tables
df_tokens
| token_str | ||||||
|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | ||
| A | Andrew Obin | q1-2021 | 2 | pres | 0 | Good |
| 1 | afternoon | |||||
| 2 | and | |||||
| 3 | welcome | |||||
| 4 | to | |||||
| ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | 3 | pres | 12 | Frank. |
| 13 | Steve | |||||
| 14 | you | |||||
| 15 | may | |||||
| 16 | begin. |
20315648 rows × 1 columns
Create terms:
df_tokens['term_str'] = df_tokens['token_str'].str.lower().str.replace('[\W_]', '')
df_tokens = df_tokens[df_tokens['term_str']!='']
df_tokens
| token_str | term_str | ||||||
|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||
| A | Andrew Obin | q1-2021 | 2 | pres | 0 | Good | good |
| 1 | afternoon | afternoon | |||||
| 2 | and | and | |||||
| 3 | welcome | welcome | |||||
| 4 | to | to | |||||
| ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | 3 | pres | 12 | Frank. | frank |
| 13 | Steve | steve | |||||
| 14 | you | you | |||||
| 15 | may | may | |||||
| 16 | begin. | begin |
20313409 rows × 2 columns
Create VOCAB table:
VOCAB = df_tokens.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
.sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'
VOCAB
| term_str | n | |
|---|---|---|
| term_id | ||
| 0 | 0 | 297 |
| 1 | 00 | 7 |
| 2 | 000 | 7099 |
| 3 | 00000 | 6 |
| 4 | 00001 | 3 |
| ... | ... | ... |
| 53139 | zürich | 4 |
| 53140 | ½ | 2 |
| 53141 | à | 28 |
| 53142 | élysées | 1 |
| 53143 | éléonore | 74 |
53144 rows × 2 columns
VOCAB['pos_tup'] = VOCAB['term_str'].apply(lambda r : nltk.pos_tag([r])[0]) # POS addition
VOCAB
| term_str | n | pos_tup | |
|---|---|---|---|
| term_id | |||
| 0 | 0 | 297 | (0, CD) |
| 1 | 00 | 7 | (00, CD) |
| 2 | 000 | 7099 | (000, CD) |
| 3 | 00000 | 6 | (00000, CD) |
| 4 | 00001 | 3 | (00001, CD) |
| ... | ... | ... | ... |
| 53139 | zürich | 4 | (zürich, NN) |
| 53140 | ½ | 2 | (½, NN) |
| 53141 | à | 28 | (à, NN) |
| 53142 | élysées | 1 | (élysées, NN) |
| 53143 | éléonore | 74 | (éléonore, NN) |
53144 rows × 3 columns
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')
Get just the POS
pos_dict = dict(zip(VOCAB.term_str, VOCAB.pos_tup))
df_tokens['pos_tup'] = df_tokens['term_str'].map(pos_dict)
df_tokens['pos'] = df_tokens.pos_tup.apply(lambda x: x[1])
df_tokens
| token_str | term_str | pos_tup | pos | ||||||
|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||||
| A | Andrew Obin | q1-2021 | 2 | pres | 0 | Good | good | (good, JJ) | JJ |
| 1 | afternoon | afternoon | (afternoon, NN) | NN | |||||
| 2 | and | and | (and, CC) | CC | |||||
| 3 | welcome | welcome | (welcome, NN) | NN | |||||
| 4 | to | to | (to, TO) | TO | |||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | 3 | pres | 12 | Frank. | frank | (frank, NN) | NN |
| 13 | Steve | steve | (steve, NN) | NN | |||||
| 14 | you | you | (you, PRP) | PRP | |||||
| 15 | may | may | (may, MD) | MD | |||||
| 16 | begin. | begin | (begin, NN) | NN |
20313409 rows × 4 columns
Get MAX POS since the POS tagging per word might be wrong. Hopefully using MAX averages out the wrong POS taggings.
df_tokens['num'] = df_tokens.term_str.str.match("\d+").astype('int')
pos_count = df_tokens.groupby(['term_str', 'pos'], as_index=False)['pos_tup'].count()\
.rename(columns = {'pos_tup':'count'})
idx = pos_count.groupby(['term_str'])['count'].transform(max) == pos_count['count']
max_map = pos_count[idx].drop(['count'], axis = 1)
Remove stop words:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int') # fill any untagged as 0, meaning not stop words
df_tokens['stop'] = df_tokens.term_str.map(sw.dummy)
df_tokens['stop'] = df_tokens['stop'].fillna(0).astype('int') # fill any untagged as 0, meaning not stop words
Stem the terms and add to the VOCAB and TOKEN tables:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)
stem_dict = dict(zip(VOCAB.term_str, VOCAB.p_stem))
df_tokens['p_stem'] = df_tokens['term_str'].map(stem_dict)
max_map = dict(max_map[['term_str', 'pos']].values)
VOCAB['max_pos'] = VOCAB.term_str.map(max_map)
VOCAB
| term_str | n | pos_tup | num | stop | p_stem | max_pos | |
|---|---|---|---|---|---|---|---|
| term_id | |||||||
| 0 | 0 | 297 | (0, CD) | 1 | 0 | 0 | CD |
| 1 | 00 | 7 | (00, CD) | 1 | 0 | 00 | CD |
| 2 | 000 | 7099 | (000, CD) | 1 | 0 | 000 | CD |
| 3 | 00000 | 6 | (00000, CD) | 1 | 0 | 00000 | CD |
| 4 | 00001 | 3 | (00001, CD) | 1 | 0 | 00001 | CD |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 53139 | zürich | 4 | (zürich, NN) | 0 | 0 | zürich | NN |
| 53140 | ½ | 2 | (½, NN) | 0 | 0 | ½ | NN |
| 53141 | à | 28 | (à, NN) | 0 | 0 | à | NN |
| 53142 | élysées | 1 | (élysées, NN) | 0 | 0 | élysé | NN |
| 53143 | éléonore | 74 | (éléonore, NN) | 0 | 0 | éléonor | NN |
53144 rows × 7 columns
df_tokens['max_pos'] = df_tokens.term_str.map(max_map)
df_tokens.sample(10)
| token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||||||||
| ACN | KC McClure | q1-2021 | 20 | pres | 62 | the | the | (the, DT) | DT | 0 | 1 | the | DT |
| MMM | Nicholas Gangestad | q3-2019 | 167 | qa | 37 | point | point | (point, NN) | NN | 0 | 0 | point | NN |
| ADBE | Saket Kalia | q1-2018 | 159 | qa | 7 | of | of | (of, IN) | IN | 0 | 1 | of | IN |
| KMB | Andrea Teixeira | q2-2021 | 198 | qa | 8 | in | in | (in, IN) | IN | 0 | 1 | in | IN |
| MU | Sanjay Mehrotra | q1-2020 | 26 | pres | 63 | SATA | sata | (sata, NNS) | NNS | 0 | 0 | sata | NNS |
| AMT | Batya Levi | q1-2019 | 104 | qa | 2 | Then | then | (then, RB) | RB | 0 | 1 | then | RB |
| PLD | Thomas S. Olinger | q4-2018 | 25 | pres | 90 | 2019 | 2019 | (2019, CD) | CD | 1 | 0 | 2019 | CD |
| LMT | James D. Taiclet | q1-2021 | 33 | pres | 74 | the | the | (the, DT) | DT | 0 | 1 | the | DT |
| ALXN | Brian Goff | q4-2018 | 141 | qa | 10 | the | the | (the, DT) | DT | 0 | 1 | the | DT |
| AMZN | Brian T. Olsavsky | q3-2019 | 92 | qa | 13 | third | third | (third, JJ) | JJ | 0 | 0 | third | JJ |
df_tokens
| token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||||||||
| A | Andrew Obin | q1-2021 | 2 | pres | 0 | Good | good | (good, JJ) | JJ | 0 | 0 | good | JJ |
| 1 | afternoon | afternoon | (afternoon, NN) | NN | 0 | 0 | afternoon | NN | |||||
| 2 | and | and | (and, CC) | CC | 0 | 1 | and | CC | |||||
| 3 | welcome | welcome | (welcome, NN) | NN | 0 | 0 | welcom | NN | |||||
| 4 | to | to | (to, TO) | TO | 0 | 1 | to | TO | |||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | 3 | pres | 12 | Frank. | frank | (frank, NN) | NN | 0 | 0 | frank | NN |
| 13 | Steve | steve | (steve, NN) | NN | 0 | 0 | steve | NN | |||||
| 14 | you | you | (you, PRP) | PRP | 0 | 1 | you | PRP | |||||
| 15 | may | may | (may, MD) | MD | 0 | 0 | may | MD | |||||
| 16 | begin. | begin | (begin, NN) | NN | 0 | 0 | begin | NN |
20313409 rows × 8 columns
Send TOKEN and VOCAB tables to CSVs.
df_tokens.to_csv('../proj_data/output_data/TOKENS.csv')
VOCAB.to_csv('../proj_data/output_data/VOCAB.csv')
Purpose: This section runs various analyses, such as TF-IDF, PCA, topic modeling, emotion/sentiment analysis, etc.
from scipy.linalg import norm
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
def tfidf(tokens_df, ohco_lev, count_type, tf_type, idf_type):
'''
a function to return the TFIDF matrix with given tokens DF,
OHCO level, count type, term-frequency type, and IDF type.
'''
# token cleanup: remove NA values
tokens_df = tokens_df[~tokens_df.term_str.isna()]
# set BoW by grouping OHCO and term_str in the tokens DF
BOW = tokens_df.groupby(ohco_lev+['term_str']).term_str.count()\
.to_frame().rename(columns={'term_str':'n'})
# create DTCM
DTCM = BOW[count_type].unstack().fillna(0).astype('int')
# some if-else statements help us handle the potential inputs
if tf_type == 'sum':
TF = DTCM.T / DTCM.T.sum()
elif tf_type == 'max':
TF = DTCM.T / DTCM.T.max()
elif tf_type == 'log':
TF = np.log10(1 + DTCM.T)
elif tf_type == 'raw':
TF =tf_typeDTCM.T
elif tf_type == 'double_norm':
TF = DTCM.T / DTCM.T.max()
TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0] # EXPLAIN; may defeat purpose of norming
elif tf_type == 'binary':
TF = DTCM.T.astype('bool').astype('int')
# transpose the term-frequency matrix
TF = TF.T
# create the document frequency matrix
DF = DTCM[DTCM > 0].count()
N = DTCM.shape[0]
# calc the IDF depending on the inputs
if idf_type == 'standard':
IDF = np.log10(N / DF)
elif idf_type == 'max':
IDF = np.log10(DF.max() / DF)
elif idf_type == 'smooth':
IDF = np.log10((1 + N) / (1 + DF)) + 1 # Correct?
# return the final TF-IDF table. Rows should rep the OHCO level; columns should rep the terms
return TF * IDF
count_method = 'n' # 'c' or 'n' # n = n tokens, c = distinct token (term) count
tf_method = 'sum' # sum, max, log, double_norm, raw, binary
tf_norm_k = .5 # only used for double_norm
idf_method = 'standard' # standard, max, smooth
OHCO = ['ticker', 'speaker', 'quarter', 'qa', 'co_count']
sns.set()
%matplotlib inline
Read in TOKEN and VOCAB tables:
%%time
# only need the TOKEN table for this
TOKEN = pd.read_csv('../proj_data/output_data/TOKENS.csv').set_index(OHCO)#.drop('Unnamed: 4', axis = 1)
CPU times: user 26.5 s, sys: 12.8 s, total: 39.3 s Wall time: 46.7 s
VOCAB = pd.read_csv('../proj_data/output_data/VOCAB.csv', encoding='latin1')
TOKEN
| Unnamed: 5 | token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | qa | co_count | |||||||||
| A | Andrew Obin | q1-2021 | pres | 2 | 0 | Good | good | ('good', 'JJ') | JJ | 0 | 0 | good | JJ |
| 2 | 1 | afternoon | afternoon | ('afternoon', 'NN') | NN | 0 | 0 | afternoon | NN | ||||
| 2 | 2 | and | and | ('and', 'CC') | CC | 0 | 1 | and | CC | ||||
| 2 | 3 | welcome | welcome | ('welcome', 'NN') | NN | 0 | 0 | welcom | NN | ||||
| 2 | 4 | to | to | ('to', 'TO') | TO | 0 | 1 | to | TO | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Vijay Jayant | q1-2021 | pres | 3 | 12 | Frank. | frank | ('frank', 'NN') | NN | 0 | 0 | frank | NN |
| 3 | 13 | Steve | steve | ('steve', 'NN') | NN | 0 | 0 | steve | NN | ||||
| 3 | 14 | you | you | ('you', 'PRP') | PRP | 0 | 1 | you | PRP | ||||
| 3 | 15 | may | may | ('may', 'MD') | MD | 0 | 0 | may | MD | ||||
| 3 | 16 | begin. | begin | ('begin', 'NN') | NN | 0 | 0 | begin | NN |
20313409 rows × 9 columns
tfidf(TOKEN, ['ticker'], 'n', 'sum', 'standard').head(5)
| term_str | 0 | 00 | 000 | 00000 | 00001 | 00003 | 0001 | 0002 | 0003 | 0004 | ... | zyrtec | zytel | zytiga | zz | zzzquil | zürich | ½ | à | élysées | éléonore |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | |||||||||||||||||||||
| A | 0.000002 | 0.0 | 1.304626e-07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| AAPL | 0.000005 | 0.0 | 1.257671e-06 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000028 | 0.0 |
| ABBV | 0.000000 | 0.0 | 6.164837e-07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| ABT | 0.000000 | 0.0 | 7.412701e-07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
| ACN | 0.000005 | 0.0 | 2.204425e-06 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
5 rows × 53144 columns
tfidf(TOKEN, ['ticker'], 'n', 'sum', 'standard').sum().sort_values(ascending = False)[:20]
term_str patients 0.035506 5g 0.019694 clients 0.018365 cloud 0.017604 clinical 0.016187 iqos 0.015769 nand 0.015631 stores 0.014643 dram 0.013908 client 0.013368 fiscal 0.013156 nike 0.012603 slide 0.012387 ultomiris 0.012365 invisalign 0.012355 intermodal 0.012003 jim 0.010877 deposits 0.010684 automotive 0.010519 bookings 0.010245 dtype: float64
tfidf(TOKEN, ['ticker', 'quarter'], 'n', 'sum', 'standard').head(5)
| term_str | 0 | 00 | 000 | 00000 | 00001 | 00003 | 0001 | 0002 | 0003 | 0004 | ... | zyrtec | zytel | zytiga | zz | zzzquil | zürich | ½ | à | élysées | éléonore | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | quarter | |||||||||||||||||||||
| A | q1-2019 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| q1-2020 | 0.0 | 0.0 | 0.000015 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| q1-2021 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| q2-2019 | 0.0 | 0.0 | 0.000014 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| q2-2020 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 53144 columns
tfidf(TOKEN, ['ticker', 'quarter'], 'n', 'sum', 'standard').sum().sort_values(ascending = False)[:20]
term_str patients 0.529191 covid 0.443773 cloud 0.380873 clients 0.349586 fiscal 0.339953 5g 0.279448 stores 0.265252 slide 0.253180 china 0.245150 pandemic 0.240004 morning 0.233094 store 0.227817 clinical 0.211744 content 0.208658 organic 0.206872 software 0.206782 mobile 0.206225 client 0.203558 digital 0.201301 customers 0.201043 dtype: float64
tok_red = TOKEN[(TOKEN.stop == 0) & (TOKEN.num == 0)] # reduced TOKEN table; remove stop words and numbers
tok_red.sample(10)
| Unnamed: 5 | token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | qa | co_count | |||||||||
| JCI | George Oliver | q1-2019 | pres | 15 | 157 | across | across | ('across', 'IN') | IN | 0 | 0 | across | IN |
| GIS | John Baumgartner | q1-2021 | qa | 216 | 45 | scraping | scraping | ('scraping', 'VBG') | VBG | 0 | 0 | scrape | VBG |
| CSX | James M. Foote | q1-2019 | pres | 13 | 40 | last | last | ('last', 'JJ') | JJ | 0 | 0 | last | JJ |
| MET | Michel A. Khalaf | q2-2020 | qa | 144 | 143 | revenues. | revenues | ('revenues', 'NNS') | NNS | 0 | 0 | revenu | NNS |
| KLAC | Patrick Ho | q3-2019 | qa | 164 | 1 | next | next | ('next', 'JJ') | JJ | 0 | 0 | next | JJ |
| ALXN | Brian Goff | q1-2019 | qa | 115 | 4 | morning. | morning | ('morning', 'NN') | NN | 0 | 0 | morn | NN |
| SYK | Preston Wells | q1-2021 | pres | 12 | 26 | Medical | medical | ('medical', 'JJ') | JJ | 0 | 0 | medic | JJ |
| AMT | Tom Bartlett | q1-2021 | qa | 78 | 3 | see | see | ('see', 'VB') | VB | 0 | 0 | see | VB |
| SBUX | Patrick J. Grismer | q4-2019 | pres | 64 | 78 | final | final | ('final', 'JJ') | JJ | 0 | 0 | final | JJ |
| ABBV | Michael E. Severino | q3-2019 | qa | 57 | 7 | PsA | psa | ('psa', 'NN') | NN | 0 | 0 | psa | NN |
Run TFIDF on the reduced TOKEN table and show important words:
tfidf(tok_red, ['ticker'], 'n', 'sum', 'standard').sum().sort_values(ascending = False)[:20]
term_str patients 0.068309 clients 0.035439 cloud 0.033663 clinical 0.031034 iqos 0.030714 nand 0.029648 stores 0.028463 dram 0.026407 client 0.025927 fiscal 0.025469 slide 0.024097 intermodal 0.023752 ultomiris 0.023725 invisalign 0.023585 nike 0.023095 jim 0.021163 deposits 0.020928 automotive 0.020632 wireless 0.020139 bookings 0.019953 dtype: float64
DTCM:
BOW = tok_red.groupby(['quarter', 'ticker', 'term_str']).term_str.count()\
.to_frame().rename(columns={'term_str':'n'})
BOW #show bag of words
| n | |||
|---|---|---|---|
| quarter | ticker | term_str | |
| q1-2018 | ADBE | ability | 5 |
| able | 4 | ||
| accelerate | 2 | ||
| accentures | 1 | ||
| access | 1 | ||
| ... | ... | ... | ... |
| q4-2021 | TJX | yih | 1 |
| youd | 1 | ||
| youll | 3 | ||
| youre | 11 | ||
| youve | 2 |
2686274 rows × 1 columns
count_method = 'n' # 'c' or 'n' # n = n tokens, c = distinct token (term) count
tf_method = 'sum' # sum, max, log, double_norm, raw, binary
tf_norm_k = .5 # only used for double_norm
idf_method = 'standard' # standard, max, smooth
DTCM = BOW[count_method].unstack().fillna(0).astype('int')
DTCM
| term_str | a000 | a006 | a10 | a100 | a100s | a12 | a12x | a13 | a14 | a1c | ... | zyrtec | zytel | zytiga | zz | zzzquil | zürich | ½ | à | élysées | éléonore | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| quarter | ticker | |||||||||||||||||||||
| q1-2018 | ADBE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| BK | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | |
| CMCSA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| DD | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| GOOGL | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| q4-2021 | NKE | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| NVDA | 0 | 0 | 0 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| ORCL | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| STZ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| TJX | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1841 rows × 49376 columns
TFIDF_red = tfidf(tok_red, ['quarter', 'ticker'], 'n', 'sum', 'standard') # run function
TFIDF_red
| term_str | a000 | a006 | a10 | a100 | a100s | a12 | a12x | a13 | a14 | a1c | ... | zyrtec | zytel | zytiga | zz | zzzquil | zürich | ½ | à | élysées | éléonore | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| quarter | ticker | |||||||||||||||||||||
| q1-2018 | ADBE | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 |
| BK | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000439 | 0.0 | 0.0 | |
| CMCSA | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| DD | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| GOOGL | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| q4-2021 | NKE | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 |
| NVDA | 0.0 | 0.0 | 0.0 | 0.00506 | 0.000649 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| ORCL | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| STZ | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | |
| TJX | 0.0 | 0.0 | 0.0 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 |
1841 rows × 49376 columns
Get top terms from the TFIDF output and start eigen value and vector creation for PCA:
top = TFIDF_red.sum().sort_values(ascending = False)[:5000] # sort and get the top 5000 words by TFI-IDF sum
top_list = list(top.index) # turn to list
top_cols = [i for i in TFIDF_red.columns if i in top_list] # return the top columns if they're in the list
TFIDF_red = TFIDF_red[top_cols] # index the TFIDF by the top columns (so it now contains the 5000 terms)
TFIDF_red
| term_str | aart | aat | abaxis | abbott | abbvie | ability | able | absence | absent | absolute | ... | york | youd | youll | young | younger | youtube | z | zero | zoetis | zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| quarter | ticker | |||||||||||||||||||||
| q1-2018 | ADBE | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000059 | 0.000023 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000115 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 |
| BK | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000010 | 0.000010 | 0.000000 | 0.0 | 0.000000 | ... | 0.000305 | 0.000000 | 0.000100 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | |
| CMCSA | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000107 | 0.000015 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000342 | 0.000000 | 0.000000 | 0.0 | 0.0 | |
| DD | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000010 | 0.000010 | 0.000208 | 0.0 | 0.000000 | ... | 0.000000 | 0.000086 | 0.000164 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | |
| GOOGL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000010 | 0.000019 | 0.000000 | 0.0 | 0.000000 | ... | 0.000150 | 0.000172 | 0.000066 | 0.000000 | 0.0 | 0.009086 | 0.000000 | 0.000000 | 0.0 | 0.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| q4-2021 | NKE | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000022 | 0.000015 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000070 | 0.000000 | 0.0 | 0.000000 | 0.001352 | 0.000000 | 0.0 | 0.0 |
| NVDA | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000043 | 0.000010 | 0.000000 | 0.0 | 0.000144 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000343 | 0.000000 | 0.000383 | 0.0 | 0.0 | |
| ORCL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000009 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000064 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000235 | 0.0 | 0.0 | |
| STZ | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000007 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000127 | 0.000097 | 0.000163 | 0.0 | 0.000239 | 0.001177 | 0.000000 | 0.0 | 0.0 | |
| TJX | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000010 | 0.000038 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000084 | 0.000097 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 |
1841 rows × 5000 columns
TFIDF_red = TFIDF_red.apply(lambda x: x / norm(x), 1) # normalize the table
TFIDF_red.to_csv('../proj_data/output_data/reduced_TFIDF.csv') # send to CSV
COV = TFIDF_red.cov() # create covariance matrix
COV.iloc[:5,:10].style.background_gradient()
| term_str | aart | aat | abaxis | abbott | abbvie | ability | able | absence | absent | absolute |
|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||
| aart | 0.000406 | -0.000002 | -0.000002 | -0.000002 | -0.000002 | -0.000001 | -0.000000 | -0.000002 | -0.000002 | 0.000002 |
| aat | -0.000002 | 0.000394 | -0.000002 | -0.000002 | -0.000002 | -0.000001 | -0.000000 | -0.000002 | 0.000000 | -0.000002 |
| abaxis | -0.000002 | -0.000002 | 0.000643 | -0.000002 | -0.000002 | -0.000002 | -0.000001 | -0.000002 | -0.000002 | -0.000000 |
| abbott | -0.000002 | -0.000002 | -0.000002 | 0.000359 | 0.000011 | -0.000001 | 0.000002 | -0.000002 | -0.000002 | 0.000002 |
| abbvie | -0.000002 | -0.000002 | -0.000002 | 0.000011 | 0.000407 | 0.000000 | 0.000001 | -0.000002 | -0.000002 | -0.000002 |
from scipy.linalg import eigh as eig
%time eig_vals, eig_vecs = eig(COV)
CPU times: user 54.8 s, sys: 12.7 s, total: 1min 7s Wall time: 12.1 s
TERM_IDX = COV.index # We could use other tables as well, e.g. TFIDF_b, TFIDF_c, or COV_c
EIG_VEC = pd.DataFrame(eig_vecs, index=TERM_IDX, columns=TERM_IDX)
EIG_VAL = pd.DataFrame(eig_vals, index=TERM_IDX, columns=['eig_val'])
EIG_VAL.index.name = 'term_id'
EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)
EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)
EIG_PAIRS.sort_values('exp_var', ascending = False).head()
| eig_val | aart | aat | abaxis | abbott | abbvie | ability | able | absence | absent | ... | youd | youll | young | younger | youtube | z | zero | zoetis | zone | exp_var | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_id | |||||||||||||||||||||
| zone | 0.018534 | -0.004843 | -0.009964 | -0.004067 | -0.006506 | -0.010236 | 0.001493 | 0.000116 | 0.005359 | 0.004168 | ... | 0.002154 | 0.004572 | -0.001321 | 0.000203 | -0.010461 | 0.001151 | 0.004615 | -0.004485 | 0.000858 | 1.94 |
| zoetis | 0.015160 | -0.007996 | 0.028827 | 0.004241 | 0.013800 | 0.027076 | -0.000570 | -0.000375 | -0.000368 | -0.002864 | ... | -0.000627 | -0.000625 | 0.007144 | 0.004399 | -0.023418 | -0.000311 | -0.007147 | 0.005371 | 0.000521 | 1.59 |
| zero | 0.012370 | 0.021532 | 0.002655 | 0.001581 | -0.001725 | 0.001468 | -0.001804 | -0.001206 | -0.005180 | -0.002745 | ... | -0.004757 | -0.002555 | -0.001026 | -0.001710 | 0.078710 | 0.005835 | -0.013351 | 0.001100 | -0.005017 | 1.30 |
| z | 0.011795 | 0.008239 | 0.004289 | -0.000668 | -0.000546 | 0.002849 | -0.001277 | -0.000168 | 0.005588 | -0.000607 | ... | -0.000060 | 0.001363 | 0.000155 | -0.005909 | -0.017000 | -0.000895 | 0.010530 | -0.001836 | 0.003760 | 1.24 |
| youtube | 0.010778 | -0.027836 | 0.005077 | -0.001693 | -0.001644 | 0.004274 | 0.001075 | 0.001508 | 0.000147 | 0.002952 | ... | 0.001458 | -0.001057 | 0.002049 | 0.001321 | 0.003566 | -0.005238 | -0.000586 | -0.001748 | 0.001261 | 1.13 |
5 rows × 5002 columns
EIG_PAIRS.exp_var.sort_values(ascending=False).head().plot.bar(rot=45)
<AxesSubplot:xlabel='term_id'>
TOPS = EIG_PAIRS.sort_values('exp_var', ascending=False).head(10).reset_index(drop=True)
TOPS.index.name = 'comp_id'
TOPS.index = ["PC{}".format(i) for i in TOPS.index.tolist()] # prefix with PC
LOADINGS = TOPS[TERM_IDX].T
LOADINGS.index.name = 'term_str'
LOADINGS = LOADINGS.join(VOCAB.set_index('term_str'), on = 'term_str')
LOADINGS.head().style.background_gradient()
| PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | term_id | n | pos_tup | num | stop | p_stem | max_pos | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||
| aart | -0.004843 | -0.007996 | 0.021532 | 0.008239 | -0.027836 | 0.002173 | 0.002134 | 0.004588 | -0.001898 | -0.009341 | 3704 | 92 | ('aart', 'NN') | 0 | 0 | aart | NN |
| aat | -0.009964 | 0.028827 | 0.002655 | 0.004289 | 0.005077 | -0.009392 | 0.017114 | -0.008820 | 0.003839 | 0.005464 | 3708 | 204 | ('aat', 'NN') | 0 | 0 | aat | NN |
| abaxis | -0.004067 | 0.004241 | 0.001581 | -0.000668 | -0.001693 | -0.004330 | 0.001257 | -0.003987 | 0.002890 | 0.003822 | 3732 | 338 | ('abaxis', 'NN') | 0 | 0 | abaxi | NN |
| abbott | -0.006506 | 0.013800 | -0.001725 | -0.000546 | -0.001644 | 0.006693 | 0.000059 | -0.001638 | 0.002729 | 0.000049 | 3738 | 128 | ('abbott', 'NN') | 0 | 0 | abbott | NN |
| abbvie | -0.010236 | 0.027076 | 0.001468 | 0.002849 | 0.004274 | -0.006120 | 0.015599 | -0.007714 | 0.003250 | 0.003475 | 3745 | 200 | ('abbvie', 'NN') | 0 | 0 | abbvi | NN |
VOCAB
| term_id | term_str | n | pos_tup | num | stop | p_stem | max_pos | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 297 | ('0', 'CD') | 1 | 0 | 0 | CD |
| 1 | 1 | 00 | 7 | ('00', 'CD') | 1 | 0 | 00 | CD |
| 2 | 2 | 000 | 7099 | ('000', 'CD') | 1 | 0 | 000 | CD |
| 3 | 3 | 00000 | 6 | ('00000', 'CD') | 1 | 0 | 00000 | CD |
| 4 | 4 | 00001 | 3 | ('00001', 'CD') | 1 | 0 | 00001 | CD |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53139 | 53139 | zürich | 4 | ('zürich', 'NN') | 0 | 0 | zürich | NN |
| 53140 | 53140 | ½ | 2 | ('½', 'NN') | 0 | 0 | ½ | NN |
| 53141 | 53141 | Ã | 28 | ('Ã ', 'NN') | 0 | 0 | Ã | NN |
| 53142 | 53142 | élysées | 1 | ('élysées', 'NN') | 0 | 0 | élysé | NN |
| 53143 | 53143 | éléonore | 74 | ('éléonore', 'NN') | 0 | 0 | éléonor | NN |
53144 rows × 8 columns
Show loadings:
lb0_neg = LOADINGS.sort_values('PC0', ascending=True).head(10).index.str.cat(sep=' ')
lb0_pos = LOADINGS.sort_values('PC0', ascending=False).head(10).index.str.cat(sep=' ')
lb1_neg = LOADINGS.sort_values('PC1', ascending=True).head(10).index.str.cat(sep=' ')
lb1_pos = LOADINGS.sort_values('PC1', ascending=False).head(10).index.str.cat(sep=' ')
lb2_neg = LOADINGS.sort_values('PC2', ascending=True).head(10).index.str.cat(sep=' ')
lb2_pos = LOADINGS.sort_values('PC2', ascending=False).head(10).index.str.cat(sep=' ')
print('PC0+', lb0_pos)
print('PC0-', lb0_neg)
print('PC1+', lb1_pos)
print('PC1-', lb1_neg)
print('PC2+', lb2_pos)
print('PC2-', lb2_neg)
PC0+ loan loans deposits banking deposit clients card mortgage client nii PC0- patients clinical patient study disease trial therapy cancer treatment fda PC1+ patients clinical patient study disease treatment trial cancer therapy medicare PC1- stores industrial store intermodal wireless cloud mobile automotive gas jim PC2+ cloud wireless mobile churn nand tower fiber broadband dram video PC2- intermodal gas stores store energy coal oampm renewables oil industrial
Words associated with top PCs are very clear: We see medical terms, financial terms, and tech and infrastructure/energy. See plots below and report for more.
DCM = TFIDF_red.dot(TOPS[TERM_IDX].T)
DCM
| PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| quarter | ticker | ||||||||||
| q1-2018 | ADBE | -0.000749 | -0.056328 | 0.127856 | -0.014374 | -0.051245 | -0.045243 | -0.073721 | -0.012429 | 0.063118 | -0.053463 |
| BK | 0.397310 | 0.085827 | -0.002396 | 0.016729 | -0.009370 | 0.049706 | -0.027101 | -0.006393 | 0.033048 | 0.093348 | |
| CMCSA | -0.000779 | -0.126082 | 0.193894 | -0.012224 | 0.278541 | 0.095238 | 0.011851 | 0.034764 | -0.023591 | -0.194990 | |
| DD | 0.001947 | -0.074045 | -0.119808 | 0.032204 | -0.019679 | 0.148421 | 0.004766 | 0.010783 | 0.036979 | 0.020233 | |
| GOOGL | -0.007706 | -0.091772 | 0.205316 | -0.040242 | 0.004412 | -0.148931 | -0.082698 | -0.015589 | 0.079129 | -0.269192 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| q4-2021 | NKE | -0.003396 | -0.038770 | -0.012512 | -0.128280 | -0.016635 | -0.025500 | -0.013070 | -0.025931 | -0.015358 | -0.034386 |
| NVDA | -0.022008 | -0.083212 | 0.169108 | 0.027764 | -0.145245 | -0.028739 | -0.013262 | 0.014953 | 0.015292 | -0.093569 | |
| ORCL | -0.006769 | -0.037956 | 0.136381 | 0.014817 | -0.114823 | -0.051035 | -0.093224 | -0.021338 | 0.089162 | -0.028869 | |
| STZ | -0.005079 | -0.043999 | -0.029159 | -0.086279 | -0.026492 | -0.012303 | 0.001349 | -0.030165 | 0.005581 | 0.011909 | |
| TJX | -0.014948 | -0.119666 | -0.185200 | -0.383189 | 0.042112 | -0.035545 | 0.072008 | -0.028837 | -0.054657 | 0.077778 |
1841 rows × 10 columns
LIB = pd.read_csv('../proj_data/output_data/LIB.csv').drop(columns = ['Unnamed: 0', 'file', 'url', 'quarter'], axis = 1)\
.drop_duplicates().set_index('ticker')
LIB
| co_clean | sector | sub_sector | hq | date_added | founded | |
|---|---|---|---|---|---|---|
| ticker | ||||||
| AMT | American Tower | Real Estate | Specialized REITs | Boston, Massachusetts | 11/19/07 | 1995 |
| RTX | Raytheon Technologies | Industrials | Aerospace & Defense | Waltham, Massachusetts | NaN | 1922 |
| IBM | IBM | Information Technology | IT Consulting & Other Services | Armonk, New York | 3/4/57 | 1911 |
| AMAT | Applied Materials | Information Technology | Semiconductor Equipment | Santa Clara, California | 3/16/95 | 1967 |
| TGT | Target Corporation | Consumer Discretionary | General Merchandise Stores | Minneapolis, Minnesota | 12/31/76 | 1902 |
| ... | ... | ... | ... | ... | ... | ... |
| AAPL | Apple | Information Technology | Technology Hardware, Storage & Peripherals | Cupertino, California | 11/30/82 | 1977 |
| GOOGL | Alphabet (Class A) | Communication Services | Interactive Media & Services | Mountain View, California | 4/3/14 | 1998 |
| AMZN | Amazon | Consumer Discretionary | Internet & Direct Marketing Retail | Seattle, Washington | 11/18/05 | 1994 |
| TSLA | Tesla | Consumer Discretionary | Automobile Manufacturers | Palo Alto, California | 12/21/20 | 2003 |
| NVDA | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 |
196 rows × 6 columns
LIB.index
Index(['AMT', 'RTX', 'IBM', 'AMAT', 'TGT', 'BLK', 'GS', 'CAT', 'ISRG', 'AMD',
...
'PFE', 'NFLX', 'ABT', 'FB', 'MSFT', 'AAPL', 'GOOGL', 'AMZN', 'TSLA',
'NVDA'],
dtype='object', name='ticker', length=196)
DCM = DCM.join(LIB, how='outer') # join LIB to DCM
DCM.index.get_level_values(1)
Index(['ADBE', 'BK', 'CMCSA', 'DD', 'GOOGL', 'INFO', 'MS', 'A', 'AAPL', 'ABBV',
...
'CRM', 'EA', 'FDX', 'GIS', 'MDT', 'NKE', 'NVDA', 'ORCL', 'STZ', 'TJX'],
dtype='object', name='ticker', length=1841)
DCM
| PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | co_clean | sector | sub_sector | hq | date_added | founded | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| quarter | ticker | ||||||||||||||||
| q1-2018 | ADBE | -0.000749 | -0.056328 | 0.127856 | -0.014374 | -0.051245 | -0.045243 | -0.073721 | -0.012429 | 0.063118 | -0.053463 | Adobe | Information Technology | Application Software | San Jose, California | 5/5/97 | 1982 |
| BK | 0.397310 | 0.085827 | -0.002396 | 0.016729 | -0.009370 | 0.049706 | -0.027101 | -0.006393 | 0.033048 | 0.093348 | BNY Mellon | Financials | Asset Management & Custody Banks | New York City | 3/31/95 | 1784 | |
| CMCSA | -0.000779 | -0.126082 | 0.193894 | -0.012224 | 0.278541 | 0.095238 | 0.011851 | 0.034764 | -0.023591 | -0.194990 | Comcast | Communication Services | Cable & Satellite | Philadelphia, Pennsylvania | 11/19/02 | 1963 | |
| DD | 0.001947 | -0.074045 | -0.119808 | 0.032204 | -0.019679 | 0.148421 | 0.004766 | 0.010783 | 0.036979 | 0.020233 | DuPont | Materials | Specialty Chemicals | Wilmington, Delaware | 4/2/19 | 2017 | |
| GOOGL | -0.007706 | -0.091772 | 0.205316 | -0.040242 | 0.004412 | -0.148931 | -0.082698 | -0.015589 | 0.079129 | -0.269192 | Alphabet (Class A) | Communication Services | Interactive Media & Services | Mountain View, California | 4/3/14 | 1998 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| q4-2021 | NKE | -0.003396 | -0.038770 | -0.012512 | -0.128280 | -0.016635 | -0.025500 | -0.013070 | -0.025931 | -0.015358 | -0.034386 | Nike | Consumer Discretionary | Apparel, Accessories & Luxury Goods | Washington County, Oregon | 11/30/88 | 1964 |
| NVDA | -0.022008 | -0.083212 | 0.169108 | 0.027764 | -0.145245 | -0.028739 | -0.013262 | 0.014953 | 0.015292 | -0.093569 | Nvidia | Information Technology | Semiconductors | Santa Clara, California | 11/30/01 | 1993 | |
| ORCL | -0.006769 | -0.037956 | 0.136381 | 0.014817 | -0.114823 | -0.051035 | -0.093224 | -0.021338 | 0.089162 | -0.028869 | Oracle | Information Technology | Application Software | Austin, Texas | 8/31/89 | 1977 | |
| STZ | -0.005079 | -0.043999 | -0.029159 | -0.086279 | -0.026492 | -0.012303 | 0.001349 | -0.030165 | 0.005581 | 0.011909 | Constellation Brands | Consumer Staples | Distillers & Vintners | Victor, New York | 7/1/05 | 1945 | |
| TJX | -0.014948 | -0.119666 | -0.185200 | -0.383189 | 0.042112 | -0.035545 | 0.072008 | -0.028837 | -0.054657 | 0.077778 | TJX Companies | Consumer Discretionary | Apparel Retail | Framingham, Massachusetts | 9/30/85 | 1987 |
1841 rows × 16 columns
DCM.reset_index(inplace=True)
def vis_pcs(M, a, b, color = 'quarter', hover = 'ticker', prefix='PC'):
fig = px.scatter(M, prefix + str(a), prefix + str(b),
color= color,
hover_name= hover, marginal_x='box', marginal_y='box', height=800)
fig.show()
vis_pcs(DCM, 0, 1, color = 'sector')
vis_pcs(DCM, 1, 2, color = 'sector')
View loadings:
def vis_loadings(M, a, b, hover = 'term_str', prefix='PC', color = 'max_pos'):
fig = px.scatter(M, prefix + str(a), prefix + str(b),
color = color,
hover_name= hover, marginal_x='box', marginal_y='box', height=800)
fig.show()
vis_loadings(LOADINGS.reset_index(), 0, 1)
vis_loadings(LOADINGS.reset_index(), 1, 2)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
wordcloud2 = WordCloud().generate(' '.join(TOKEN['term_str']))
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
%matplotlib inline
n_terms = 4000
n_topics = 30
max_iter = 5
The transition CSV contains a DF with transitions between speakers in each conference call. This is sort of like a paragraph change in a book, so I will be thinking about it this way.
PARAS = pd.read_csv('../proj_data/output_data/transition_call.csv').set_index(OHCO[:3]+['co_count', 'qa'])
Adding the index including Q&A flag:
CALLS_qa = pd.read_csv('../proj_data/output_data/transition_call.csv').set_index(['ticker', 'quarter', 'qa']).drop(columns = ['speaker', 'co_count'])
CALLS_qa
| transcript | |||
|---|---|---|---|
| ticker | quarter | qa | |
| A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... |
| pres | They just revealed what they believe are the t... | ||
| q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | |
| pres | You can find the press release, investor prese... | ||
| pres | They just revealed what they believe are the t... | ||
| ... | ... | ... | ... |
| ZTS | q2-2019 | qa | Hi. Thanks for taking my questions. This is Th... |
| qa | Second question is, if you could comment on ot... | ||
| q1-2021 | pres | Welcome to the First Quarter 2021 Financial Re... | |
| pres | It is now my pleasure to turn the floor over t... | ||
| pres | They just revealed what they believe are the t... |
276687 rows × 1 columns
PARAS.head()
| transcript | |||||
|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |
| A | Andrew Obin | q1-2021 | 2 | pres | Good afternoon and welcome to the Agilent Tech... |
| 3 | pres | They just revealed what they believe are the t... | |||
| Ankur Dhingra | q1-2019 | 4 | pres | Thank you. And welcome, everyone, to Agilent's... | |
| 5 | pres | You can find the press release, investor prese... | |||
| 6 | pres | They just revealed what they believe are the t... |
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
tf = tfv.fit_transform(PARAS.transcript)
TERMS = tfv.get_feature_names()
lda = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
Create Phi and Theta tables.
THETA = pd.DataFrame(lda.fit_transform(tf), index=PARAS.index)
THETA.columns.name = 'topic_id'
THETA.sample(10).style.background_gradient()
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | ||||||||||||||||||||||||||||||
| DD | David Begleiter | q1-2020 | 150 | qa | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.249895 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.004762 | 0.616772 | 0.004762 | 0.004762 | 0.004762 |
| MMM | Nicholas C. Gangestad | q1-2020 | 151 | qa | 0.050576 | 0.000490 | 0.000490 | 0.000490 | 0.076827 | 0.000490 | 0.041368 | 0.000490 | 0.000490 | 0.000490 | 0.000490 | 0.000490 | 0.000490 | 0.082726 | 0.000490 | 0.000490 | 0.000490 | 0.000490 | 0.076442 | 0.000490 | 0.000490 | 0.087274 | 0.000490 | 0.177657 | 0.000490 | 0.000490 | 0.000490 | 0.396346 | 0.000490 | 0.000490 |
| PYPL | John Rainey | q2-2019 | 120 | qa | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.061935 | 0.073400 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.001389 | 0.113409 | 0.001389 | 0.001389 | 0.036913 | 0.679621 | 0.001389 | 0.001389 |
| MSFT | Satya Nadella | q1-2021 | 30 | pres | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.501830 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.119005 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.359600 | 0.000725 | 0.000725 | 0.000725 | 0.000725 |
| WM | Jim Fish | q4-2020 | 329 | qa | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.340338 | 0.011111 | 0.011111 | 0.011111 | 0.348551 | 0.011111 |
| ATVI | Bobby Kotick | q3-2020 | 126 | qa | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.876107 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.072041 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 |
| ZTS | Juan Ramon Alaix | q4-2018 | 163 | qa | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.011111 | 0.677778 |
| JNJ | Alex Gorsky | q1-2020 | 24 | pres | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.703036 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.130059 | 0.002083 | 0.110655 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 | 0.002083 |
| EQIX | Nick Del Deo | q2-2019 | 219 | qa | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.006667 | 0.806667 | 0.006667 | 0.006667 | 0.006667 |
| JPM | Jamie Dimon | q1-2021 | 121 | qa | 0.002564 | 0.167684 | 0.316654 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.083034 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.002564 | 0.365962 | 0.002564 | 0.002564 | 0.002564 |
PHI = pd.DataFrame(lda.components_, columns=TERMS)
PHI.index.name = 'topic_id'
PHI.columns.name = 'term_str'
PHI.T.head().style.background_gradient()
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||||||||||||
| 00 | 178.275065 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.325185 | 18.360478 | 0.036282 | 0.409775 | 0.033333 | 0.121858 | 2.332798 | 0.033333 | 0.033333 | 11.391410 | 0.033333 | 0.033333 | 0.033337 | 0.033333 | 12.869841 | 7.073807 | 0.033333 | 60.810730 | 0.033333 | 1.335269 | 1.636279 | 28.485067 | 0.036152 | 0.033333 |
| 000 | 0.038939 | 26.020049 | 7.048408 | 0.698534 | 0.499340 | 12.908294 | 651.668073 | 157.185691 | 22.312881 | 112.275871 | 27.075890 | 25.318790 | 330.159397 | 10.659179 | 0.812952 | 0.033333 | 7.975744 | 1.147291 | 0.777610 | 0.134347 | 2.718398 | 30.931850 | 24.899615 | 559.181185 | 0.515851 | 5087.740971 | 1.377465 | 2.137726 | 4.687015 | 0.059310 |
| 01 | 0.033333 | 0.033333 | 0.465936 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 16.247922 | 0.033333 | 0.058720 | 0.033333 | 0.035685 | 7.919690 | 0.087139 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 22.252673 | 259.551667 | 0.033333 | 99.624544 | 0.033333 | 0.039304 | 0.083385 | 0.033333 | 0.033333 | 0.033333 |
| 02 | 0.033333 | 0.033333 | 0.119509 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 16.300904 | 0.033333 | 0.033333 | 0.033333 | 0.973674 | 1.233954 | 0.042101 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 10.111357 | 374.235596 | 0.170386 | 119.883773 | 0.198441 | 0.130223 | 0.198140 | 3.835274 | 0.033333 | 0.033333 |
| 03 | 0.033333 | 0.033333 | 0.088683 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 27.205553 | 0.033333 | 0.033333 | 0.061401 | 2.072222 | 8.993617 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 18.818768 | 327.533013 | 4.265658 | 94.380151 | 0.036211 | 0.033333 | 0.035774 | 0.908948 | 0.033333 | 0.033333 |
Get topics:
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'})\
.groupby('topic_id')\
.apply(lambda x:
x.weight.sort_values(ascending=False)\
.head(10)\
.reset_index()\
.drop('topic_id',1)\
.term_str)
TOPICS[:5]
| term_str | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
| topic_id | ||||||||||
| 0 | gaap | non | financial | today | release | measures | earnings | website | available | thank |
| 1 | really | new | people | ve | just | sure | team | supply | make | work |
| 2 | going | know | ve | don | think | really | right | ll | say | yes |
| 3 | thank | great | continue | ve | got | going | business | ll | strategy | cost |
| 4 | markets | market | single | category | care | growth | home | digits | double | digit |
TOPICS['label'] = TOPICS.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)
## Sort Topics by Doc Weight
TOPICS['doc_weight_sum'] = THETA.sum()
TOPICS.sort_values('doc_weight_sum', ascending=True).plot.barh(y='doc_weight_sum', x='label', figsize=(5,10))
<AxesSubplot:ylabel='label'>
THETA
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||||||||||||||||||||
| A | Andrew Obin | q1-2021 | 2 | pres | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | ... | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 |
| 3 | pres | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | ... | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.946296 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | |||
| Ankur Dhingra | q1-2019 | 4 | pres | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.041664 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | ... | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | |
| 5 | pres | 0.973874 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | ... | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | |||
| 6 | pres | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | ... | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.946296 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | 0.001852 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Thomas Chiu | q2-2019 | 157 | qa | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.238217 | 0.001010 | 0.001010 | ... | 0.001010 | 0.001010 | 0.088681 | 0.001010 | 0.001010 | 0.001010 | 0.472156 | 0.001010 | 0.001010 | 0.001010 |
| 158 | qa | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.151846 | 0.000952 | 0.000952 | 0.000952 | 0.335442 | 0.000952 | ... | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.000952 | 0.125234 | |||
| Vijay Jayant | q1-2021 | 2 | pres | 0.751585 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | ... | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | 0.000725 | |
| 3 | pres | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | ... | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | 0.004167 | |||
| 4 | pres | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | ... | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.943137 | 0.001961 | 0.001961 | 0.001961 | 0.001961 | 0.001961 |
276687 rows × 30 columns
topic_cols = [t for t in range(n_topics)]
QA = THETA.groupby('qa')[topic_cols].mean().T
QA.index.name = 'topic_id'
QA.T
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| qa | |||||||||||||||||||||
| pres | 0.030697 | 0.016437 | 0.006065 | 0.010178 | 0.014042 | 0.018658 | 0.053985 | 0.106829 | 0.010201 | 0.041267 | ... | 0.029188 | 0.122435 | 0.029510 | 0.069518 | 0.012784 | 0.047032 | 0.013685 | 0.008448 | 0.012330 | 0.020498 |
| qa | 0.008514 | 0.032370 | 0.046671 | 0.041600 | 0.016275 | 0.014557 | 0.017288 | 0.026115 | 0.011239 | 0.025673 | ... | 0.021204 | 0.023640 | 0.017694 | 0.021943 | 0.016743 | 0.021154 | 0.089758 | 0.057189 | 0.023794 | 0.093182 |
2 rows × 30 columns
QA['topterms'] = TOPICS[[i for i in range(10)]].apply(lambda x: ' '.join(x), 1)
QA.sort_values('pres', ascending=False).style.background_gradient()
| qa | pres | qa | topterms |
|---|---|---|---|
| topic_id | |||
| 21 | 0.122435 | 0.023640 | year quarter million basis margin operating points revenue adjusted billion |
| 7 | 0.106829 | 0.026115 | year growth quarter revenue strong sales business grew driven expect |
| 19 | 0.103344 | 0.023755 | turn ll operator conference quarter thank questions instructions like today |
| 16 | 0.088368 | 0.020773 | growth continue business term strong long market new portfolio innovation |
| 23 | 0.069518 | 0.021943 | quarter covid year impact guidance 19 expect 2020 second sales |
| 12 | 0.054827 | 0.013133 | billion million capital debt quarter balance year sheet continue cash |
| 6 | 0.053985 | 0.017288 | new customers patients solutions care digital market continue patient including |
| 25 | 0.047032 | 0.021154 | customers 000 ve stores work new employees day people team |
| 9 | 0.041267 | 0.025673 | cloud customers data platform customer new value digital experience people |
| 0 | 0.030697 | 0.008514 | gaap non financial today release measures earnings website available thank |
| 22 | 0.029510 | 0.017694 | patients data phase chief officer study executive clinical disease trial |
| 20 | 0.029188 | 0.021204 | cash flow billion free year capital quarter share duration minutes |
| 15 | 0.027587 | 0.008024 | forward looking statements results today actual factors risks materially differ |
| 29 | 0.020498 | 0.093182 | question line comes open ahead thank bank ll morgan proceed |
| 5 | 0.018658 | 0.014557 | credit capital asset project investment management risk equity data esg |
| 1 | 0.016437 | 0.032370 | really new people ve just sure team supply make work |
| 4 | 0.014042 | 0.016275 | markets market single category care growth home digits double digit |
| 26 | 0.013685 | 0.089758 | thanks good morning thank guys okay great hi hey question |
| 24 | 0.012784 | 0.016743 | right better best stocks 10 believe buy investors just think |
| 28 | 0.012330 | 0.023794 | ve china question think yes company did way want know |
| 18 | 0.011902 | 0.053457 | business growth think seeing market really ve bit little just |
| 8 | 0.010201 | 0.011239 | content new copper ve production mining development years foundry grasberg |
| 3 | 0.010178 | 0.041600 | thank great continue ve got going business ll strategy cost |
| 13 | 0.009308 | 0.046392 | think going term long time ll things just really ve |
| 14 | 0.008650 | 0.024776 | remarks closing ok don prepared think time capital operator way |
| 17 | 0.008638 | 0.048910 | think customers really business going look opportunity market lot products |
| 10 | 0.008508 | 0.037637 | think year going cost just costs capital capex spend ve |
| 27 | 0.008448 | 0.057189 | ve year think look good ll really seen just pricing |
| 2 | 0.006065 | 0.046671 | going know ve don think really right ll say yes |
| 11 | 0.005083 | 0.096542 | just maybe kind think question little bit year thanks like |
QA.sort_values('qa', ascending=False).style.background_gradient()
| qa | pres | qa | topterms |
|---|---|---|---|
| topic_id | |||
| 11 | 0.005083 | 0.096542 | just maybe kind think question little bit year thanks like |
| 29 | 0.020498 | 0.093182 | question line comes open ahead thank bank ll morgan proceed |
| 26 | 0.013685 | 0.089758 | thanks good morning thank guys okay great hi hey question |
| 27 | 0.008448 | 0.057189 | ve year think look good ll really seen just pricing |
| 18 | 0.011902 | 0.053457 | business growth think seeing market really ve bit little just |
| 17 | 0.008638 | 0.048910 | think customers really business going look opportunity market lot products |
| 2 | 0.006065 | 0.046671 | going know ve don think really right ll say yes |
| 13 | 0.009308 | 0.046392 | think going term long time ll things just really ve |
| 3 | 0.010178 | 0.041600 | thank great continue ve got going business ll strategy cost |
| 10 | 0.008508 | 0.037637 | think year going cost just costs capital capex spend ve |
| 1 | 0.016437 | 0.032370 | really new people ve just sure team supply make work |
| 7 | 0.106829 | 0.026115 | year growth quarter revenue strong sales business grew driven expect |
| 9 | 0.041267 | 0.025673 | cloud customers data platform customer new value digital experience people |
| 14 | 0.008650 | 0.024776 | remarks closing ok don prepared think time capital operator way |
| 28 | 0.012330 | 0.023794 | ve china question think yes company did way want know |
| 19 | 0.103344 | 0.023755 | turn ll operator conference quarter thank questions instructions like today |
| 21 | 0.122435 | 0.023640 | year quarter million basis margin operating points revenue adjusted billion |
| 23 | 0.069518 | 0.021943 | quarter covid year impact guidance 19 expect 2020 second sales |
| 20 | 0.029188 | 0.021204 | cash flow billion free year capital quarter share duration minutes |
| 25 | 0.047032 | 0.021154 | customers 000 ve stores work new employees day people team |
| 16 | 0.088368 | 0.020773 | growth continue business term strong long market new portfolio innovation |
| 22 | 0.029510 | 0.017694 | patients data phase chief officer study executive clinical disease trial |
| 6 | 0.053985 | 0.017288 | new customers patients solutions care digital market continue patient including |
| 24 | 0.012784 | 0.016743 | right better best stocks 10 believe buy investors just think |
| 4 | 0.014042 | 0.016275 | markets market single category care growth home digits double digit |
| 5 | 0.018658 | 0.014557 | credit capital asset project investment management risk equity data esg |
| 12 | 0.054827 | 0.013133 | billion million capital debt quarter balance year sheet continue cash |
| 8 | 0.010201 | 0.011239 | content new copper ve production mining development years foundry grasberg |
| 0 | 0.030697 | 0.008514 | gaap non financial today release measures earnings website available thank |
| 15 | 0.027587 | 0.008024 | forward looking statements results today actual factors risks materially differ |
import plotly_express as px
px.scatter(QA.reset_index(), 'qa', 'pres', hover_name='topterms', text='topic_id', height = 800)\
.update_traces(mode='text')
See report for more details on these graphs
mcd = pd.read_csv('LoughranMcDonald_MasterDictionary_2020.csv')
mcd.head()
| Word | Seq_num | Word Count | Word Proportion | Average Proportion | Std Dev | Doc Count | Negative | Positive | Uncertainty | Litigious | Strong_Modal | Weak_Modal | Constraining | Complexity | Syllables | Source | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AARDVARK | 1 | 312 | 1.422050e-08 | 1.335201e-08 | 3.700747e-06 | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf |
| 1 | AARDVARKS | 2 | 3 | 1.367356e-10 | 8.882163e-12 | 9.362849e-09 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf |
| 2 | ABACI | 3 | 9 | 4.102067e-10 | 1.200533e-10 | 5.359747e-08 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf |
| 3 | ABACK | 4 | 15 | 6.836779e-10 | 4.080549e-10 | 1.406914e-07 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf |
| 4 | ABACUS | 5 | 8009 | 3.650384e-07 | 3.798698e-07 | 3.523914e-05 | 1058 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf |
mcd['Word'] = mcd['Word'].str.lower()
mcd.head(1)
| Word | Seq_num | Word Count | Word Proportion | Average Proportion | Std Dev | Doc Count | Negative | Positive | Uncertainty | Litigious | Strong_Modal | Weak_Modal | Constraining | Complexity | Syllables | Source | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aardvark | 1 | 312 | 1.422050e-08 | 1.335201e-08 | 0.000004 | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf |
from IPython.core.display import display, HTML
import seaborn as sns
sns.set()
mcd.columns
Index(['Word', 'Seq_num', 'Word Count', 'Word Proportion',
'Average Proportion', 'Std Dev', 'Doc Count', 'Negative', 'Positive',
'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal',
'Constraining', 'Complexity', 'Syllables', 'Source'],
dtype='object')
mcd.rename(columns = {'Word':'term_str'}, inplace = True)
mcd.Positive = np.where(mcd.Positive != 0, 1, 0)
mcd.Negative = np.where(mcd.Negative != 0, -1, 0)
m_clean = [i for i in mcd.columns if i not in ['term_str', 'Seq_num', 'Word Count', 'Word Proportion', 'Average Proportion', 'Std Dev', 'Doc Count', 'Negative', 'Positive', 'Syllables', 'Source']]
m_clean
['Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal', 'Constraining', 'Complexity']
mcd[m_clean] = np.where(mcd[m_clean] != 0, 1, 0)
mcd['polarity'] = mcd.Positive - mcd.Negative
mcd
| term_str | Seq_num | Word Count | Word Proportion | Average Proportion | Std Dev | Doc Count | Negative | Positive | Uncertainty | Litigious | Strong_Modal | Weak_Modal | Constraining | Complexity | Syllables | Source | polarity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aardvark | 1 | 312 | 1.422050e-08 | 1.335201e-08 | 3.700747e-06 | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf | 0 |
| 1 | aardvarks | 2 | 3 | 1.367356e-10 | 8.882163e-12 | 9.362849e-09 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf | 0 |
| 2 | abaci | 3 | 9 | 4.102067e-10 | 1.200533e-10 | 5.359747e-08 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf | 0 |
| 3 | aback | 4 | 15 | 6.836779e-10 | 4.080549e-10 | 1.406914e-07 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf | 0 |
| 4 | abacus | 5 | 8009 | 3.650384e-07 | 3.798698e-07 | 3.523914e-05 | 1058 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 86526 | zygote | 86529 | 48 | 2.187769e-09 | 8.817180e-10 | 1.907714e-07 | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf | 0 |
| 86527 | zygotes | 86530 | 1 | 4.557853e-11 | 1.857263e-11 | 1.957775e-08 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12of12inf | 0 |
| 86528 | zygotic | 86531 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf | 0 |
| 86529 | zymurgies | 86532 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf | 0 |
| 86530 | zymurgy | 86533 | 0 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 12of12inf | 0 |
86531 rows × 18 columns
TOKEN.head()
| Unnamed: 5 | token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | qa | co_count | |||||||||
| A | Andrew Obin | q1-2021 | pres | 2 | 0 | Good | good | ('good', 'JJ') | JJ | 0 | 0 | good | JJ |
| 2 | 1 | afternoon | afternoon | ('afternoon', 'NN') | NN | 0 | 0 | afternoon | NN | ||||
| 2 | 2 | and | and | ('and', 'CC') | CC | 0 | 1 | and | CC | ||||
| 2 | 3 | welcome | welcome | ('welcome', 'NN') | NN | 0 | 0 | welcom | NN | ||||
| 2 | 4 | to | to | ('to', 'TO') | TO | 0 | 1 | to | TO |
TOKEN = TOKEN.reset_index().merge(mcd, on='term_str', how='left').drop(columns = 'Unnamed: 5')
TOKEN.head()
| ticker | speaker | quarter | qa | co_count | token_str | term_str | pos_tup | pos | num | ... | Positive | Uncertainty | Litigious | Strong_Modal | Weak_Modal | Constraining | Complexity | Syllables | Source | polarity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | Andrew Obin | q1-2021 | pres | 2 | Good | good | ('good', 'JJ') | JJ | 0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 1.0 |
| 1 | A | Andrew Obin | q1-2021 | pres | 2 | afternoon | afternoon | ('afternoon', 'NN') | NN | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 12of12inf | 0.0 |
| 2 | A | Andrew Obin | q1-2021 | pres | 2 | and | and | ('and', 'CC') | CC | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 0.0 |
| 3 | A | Andrew Obin | q1-2021 | pres | 2 | welcome | welcome | ('welcome', 'NN') | NN | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 12of12inf | 0.0 |
| 4 | A | Andrew Obin | q1-2021 | pres | 2 | to | to | ('to', 'TO') | TO | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 0.0 |
5 rows × 30 columns
TOKEN.set_index(['ticker', 'speaker', 'quarter', 'qa', 'co_count'], inplace = True)
TOKEN.head()
| token_str | term_str | pos_tup | pos | num | stop | p_stem | max_pos | Seq_num | Word Count | ... | Positive | Uncertainty | Litigious | Strong_Modal | Weak_Modal | Constraining | Complexity | Syllables | Source | polarity | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | qa | co_count | |||||||||||||||||||||
| A | Andrew Obin | q1-2021 | pres | 2 | Good | good | ('good', 'JJ') | JJ | 0 | 0 | good | JJ | 32116.0 | 3719388.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 1.0 |
| 2 | afternoon | afternoon | ('afternoon', 'NN') | NN | 0 | 0 | afternoon | NN | 1403.0 | 5440.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 12of12inf | 0.0 | ||||
| 2 | and | and | ('and', 'CC') | CC | 0 | 1 | and | CC | 2491.0 | 828265940.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 0.0 | ||||
| 2 | welcome | welcome | ('welcome', 'NN') | NN | 0 | 0 | welcom | NN | 84675.0 | 12367.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 12of12inf | 0.0 | ||||
| 2 | to | to | ('to', 'TO') | TO | 0 | 1 | to | TO | 77811.0 | 671191739.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 12of12inf | 0.0 |
5 rows × 25 columns
TOKEN.columns
Index(['token_str', 'term_str', 'pos_tup', 'pos', 'num', 'stop', 'p_stem',
'max_pos', 'Seq_num', 'Word Count', 'Word Proportion',
'Average Proportion', 'Std Dev', 'Doc Count', 'Negative', 'Positive',
'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal',
'Constraining', 'Complexity', 'Syllables', 'Source', 'polarity'],
dtype='object')
TOKEN[m_clean] = TOKEN[m_clean].fillna(0)
qAns = TOKEN[TOKEN.index.get_level_values('qa')=='qa'].copy() # need a copy or else it'll be a shallow copy and affect the tokens table
pres = TOKEN[TOKEN.index.get_level_values('qa')=='pres'].copy()
qAns[m_clean + ['polarity']].mean().sort_values().plot.barh()
<AxesSubplot:>
Interesting that the polarity is higher in Q&A -- maybe because people are generally congratulatory? Complexity is higher.
pres[m_clean + ['polarity']].mean().sort_values().plot.barh()
<AxesSubplot:>
from gensim.models import word2vec
There is so much data, so break out by ticker for now. Add tickers to the val list to run more. Can also adjust to run for sectors.
col = 'ticker'
val = ['AAPL', 'BA', 'F', 'MSFT']
TOKEN_sne = TOKEN.reset_index()
corpus = TOKEN_sne[TOKEN_sne[col].isin(val)]\
.groupby(OHCO)\
.term_str.apply(lambda x: x.tolist())\
.reset_index()['term_str'].tolist()
model = word2vec.Word2Vec(corpus, vector_size=246, window=5, min_count=200, workers=4)
coords = pd.DataFrame(index=range(len(model.wv.key_to_index)))
coords['label'] = model.wv.index_to_key
coords['vector'] = coords['label'].apply(lambda x: model.wv.get_vector(x))
from sklearn.manifold import TSNE
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_vals = tsne_model.fit_transform(coords['vector'].tolist())
coords['x'] = tsne_vals[:,0]
coords['y'] = tsne_vals[:,1]
px.scatter(coords, 'x', 'y', text='label', height=1000).update_traces(mode='text')
Prelim analysis:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
Apply polarity scores to each transition in the 'paragraph' data frame.
PARAS_vader_cols = PARAS.transcript.apply(analyser.polarity_scores).apply(lambda x: pd.Series(x))
PARAS_vader = pd.concat([PARAS, PARAS_vader_cols], axis=1)
CALLS=pd.read_csv('../proj_data/output_data/CALL.csv')
Apply the same for the data frame with Q&A flag.
CALLS_vader_cols = CALLS_qa.transcript.apply(analyser.polarity_scores).apply(lambda x: pd.Series(x))
CALLS_vader = pd.concat([CALLS_qa, CALLS_vader_cols], axis=1)
PARAS_vader
| transcript | neg | neu | pos | compound | |||||
|---|---|---|---|---|---|---|---|---|---|
| ticker | speaker | quarter | co_count | qa | |||||
| A | Andrew Obin | q1-2021 | 2 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 |
| 3 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | |||
| Ankur Dhingra | q1-2019 | 4 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | |
| 5 | pres | You can find the press release, investor prese... | 0.0 | 1.000 | 0.000 | 0.0000 | |||
| 6 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | Thomas Chiu | q2-2019 | 157 | qa | Hi. Thanks for taking my questions. This is Th... | 0.0 | 0.803 | 0.197 | 0.9528 |
| 158 | qa | Second question is, if you could comment on ot... | 0.0 | 0.880 | 0.120 | 0.8796 | |||
| Vijay Jayant | q1-2021 | 2 | pres | Welcome to the First Quarter 2021 Financial Re... | 0.0 | 0.968 | 0.032 | 0.4588 | |
| 3 | pres | It is now my pleasure to turn the floor over t... | 0.0 | 0.812 | 0.188 | 0.5719 | |||
| 4 | pres | They just revealed what they believe are the t... | 0.0 | 0.821 | 0.179 | 0.8122 |
276687 rows × 5 columns
CALLS_vader
| transcript | neg | neu | pos | compound | |||
|---|---|---|---|---|---|---|---|
| ticker | quarter | qa | |||||
| A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 |
| pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | ||
| q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | |
| pres | You can find the press release, investor prese... | 0.0 | 1.000 | 0.000 | 0.0000 | ||
| pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | ||
| ... | ... | ... | ... | ... | ... | ... | ... |
| ZTS | q2-2019 | qa | Hi. Thanks for taking my questions. This is Th... | 0.0 | 0.803 | 0.197 | 0.9528 |
| qa | Second question is, if you could comment on ot... | 0.0 | 0.880 | 0.120 | 0.8796 | ||
| q1-2021 | pres | Welcome to the First Quarter 2021 Financial Re... | 0.0 | 0.968 | 0.032 | 0.4588 | |
| pres | It is now my pleasure to turn the floor over t... | 0.0 | 0.812 | 0.188 | 0.5719 | ||
| pres | They just revealed what they believe are the t... | 0.0 | 0.821 | 0.179 | 0.8122 |
276687 rows × 5 columns
Apple sentiment chart for Q1 2021 (just as an example):
call_test = CALLS_vader.loc['AAPL', 'q1-2021', 'qa'] # Change ticker, quarter, and qa here to see plots for other triplets
w = int(call_test.shape[0] / 5)
call_test[['pos','neg']].rolling(w).mean().plot(figsize=(25,5)) # rolling smooths out the lines
call_test[['neu']].rolling(w).mean().plot(figsize=(25,5))
call_test[['compound']].rolling(w).mean().plot(figsize=(25,5))
<AxesSubplot:xlabel='ticker,quarter,qa'>
df_map = pd.read_csv('../proj_data/output_data/price_map.csv') # bring in the price info, which contains the price as of each call
df_transition = PARAS_vader.merge(df_map, on = ['ticker', 'quarter', 'qa'], how = 'left') # merge with the VADER paragraph DF
df_calls = CALLS_vader.merge(df_map, on = ['ticker', 'quarter', 'qa'], how = 'left') # merge with the VADER calls DF
Sent Analysis run on transition from speaker to speaker. I figure this will be better for more proper sentiment analysis on smaller chunks; then the data are aggregated up to call.
df_transition.head(3) # transition from speaker to speaker. I figure this will be better for more proper sentiment analysis on smaller chunks
| ticker | quarter | qa | transcript | neg | neu | pos | compound | Unnamed: 0 | co_clean | ... | date_81 | date_82 | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 | 154418 | Agilent Technologies | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 |
| 1 | A | q1-2021 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 154418 | Agilent Technologies | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 |
| 2 | A | q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | 153000 | Agilent Technologies | ... | 2019-05-13 | 2019-05-14 | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 |
3 rows × 193 columns
df_calls.head(3)
| ticker | quarter | qa | transcript | neg | neu | pos | compound | Unnamed: 0 | co_clean | ... | date_81 | date_82 | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 | 154418 | Agilent Technologies | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 |
| 1 | A | q1-2021 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 154418 | Agilent Technologies | ... | 2021-05-08 | 2021-05-09 | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 |
| 2 | A | q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | 153000 | Agilent Technologies | ... | 2019-05-13 | 2019-05-14 | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 |
3 rows × 193 columns
import nltk
Create some functions to help with analysis:
# I want to pull just the POS from the POS tuple:
def get_pos(col):
return [i[1] for i in col if True]
# count if POS is in POS list; divide by word count; I don't really use this, but normalize later
def pos_count(l, pos = ['JJ', 'JJR', 'JJS']): # default to adjectives
try:
return len([i for i in l if i in pos])/len([p for p in l if p not in [',', '.', '?', ':', '-', ';', '!']])
except:
return 0
def pos_tot(l, pos = ['JJ', 'JJR', 'JJS']):
return len([i for i in l if i in pos])
#nltk.help.upenn_tagset()
Run VADER on each transition prior to aggregating.
df_transition['tokens'] = df_transition['transcript'].apply(nltk.word_tokenize) # tokenize transcript line into a new column
df_transition['part_os'] = df_transition['tokens'].apply(nltk.pos_tag).apply(get_pos) # apply the POS tagging
df_transition.head()
| ticker | quarter | qa | transcript | neg | neu | pos | compound | Unnamed: 0 | co_clean | ... | date_83 | date_84 | date_85 | date_86 | date_87 | date_88 | date_89 | date_90 | tokens | part_os | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 | 154418 | Agilent Technologies | ... | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 | [Good, afternoon, and, welcome, to, the, Agile... | [JJ, NN, CC, NN, TO, DT, NNP, NNPS, NNP, NNP, ... |
| 1 | A | q1-2021 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 154418 | Agilent Technologies | ... | 2021-05-10 | 2021-05-11 | 2021-05-12 | 2021-05-13 | 2021-05-14 | 2021-05-15 | 2021-05-16 | 2021-05-17 | [They, just, revealed, what, they, believe, ar... | [PRP, RB, VBD, WP, PRP, VBP, VBP, DT, JJ, JJS,... |
| 2 | A | q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | 153000 | Agilent Technologies | ... | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 | [Thank, you, ., And, welcome, ,, everyone, ,, ... | [NNP, PRP, ., CC, JJ, ,, NN, ,, TO, NNP, POS, ... |
| 3 | A | q1-2019 | pres | You can find the press release, investor prese... | 0.0 | 1.000 | 0.000 | 0.0000 | 153000 | Agilent Technologies | ... | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 | [You, can, find, the, press, release, ,, inves... | [PRP, MD, VB, DT, NN, NN, ,, NN, NN, ,, CC, NN... |
| 4 | A | q1-2019 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 153000 | Agilent Technologies | ... | 2019-05-15 | 2019-05-16 | 2019-05-17 | 2019-05-18 | 2019-05-19 | 2019-05-20 | 2019-05-21 | 2019-05-22 | [They, just, revealed, what, they, believe, ar... | [PRP, RB, VBD, WP, PRP, VBP, VBP, DT, JJ, JJS,... |
5 rows × 195 columns
Add features for modeling based on POS tagging (divided by word count):
df_transition['adj_count'] = df_transition['part_os'].apply(pos_tot, args = (['JJ']))
df_transition['superlatives_count'] = df_transition['part_os'].apply(pos_tot, args = (['JJS', 'RBS'],))
df_transition['modal_count'] = df_transition['part_os'].apply(pos_tot, args = (['MD']))
df_transition['verb_past_count'] = df_transition['part_os'].apply(pos_tot, args = (['VBD', 'VBN'],))
df_transition['verb_pres_count'] = df_transition['part_os'].apply(pos_tot, args = (['VBZ', 'VBG', 'VBP'],))
df_transition.columns
Index(['ticker', 'quarter', 'qa', 'transcript', 'neg', 'neu', 'pos',
'compound', 'Unnamed: 0', 'co_clean',
...
'date_88', 'date_89', 'date_90', 'tokens', 'part_os', 'adj_count',
'superlatives_count', 'modal_count', 'verb_past_count',
'verb_pres_count'],
dtype='object', length=200)
I want to drop the excess date columns as they are not necessary. I do this with list comprehension.
df_transition.drop(columns = [i for i in df_transition.columns if 'date_' in i]).head(5)
| ticker | quarter | qa | transcript | neg | neu | pos | compound | Unnamed: 0 | co_clean | ... | close_88 | close_89 | close_90 | tokens | part_os | adj_count | superlatives_count | modal_count | verb_past_count | verb_pres_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | q1-2021 | pres | Good afternoon and welcome to the Agilent Tech... | 0.0 | 0.826 | 0.174 | 0.8689 | 154418 | Agilent Technologies | ... | NaN | NaN | 129.939209 | [Good, afternoon, and, welcome, to, the, Agile... | [JJ, NN, CC, NN, TO, DT, NNP, NNPS, NNP, NNP, ... | 2 | 0 | 2 | 2 | 1 |
| 1 | A | q1-2021 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 154418 | Agilent Technologies | ... | NaN | NaN | 129.939209 | [They, just, revealed, what, they, believe, ar... | [PRP, RB, VBD, WP, PRP, VBP, VBP, DT, JJ, JJS,... | 3 | 1 | 0 | 2 | 5 |
| 2 | A | q1-2019 | pres | Thank you. And welcome, everyone, to Agilent's... | 0.0 | 0.924 | 0.076 | 0.6705 | 153000 | Agilent Technologies | ... | 67.742966 | 67.143463 | 66.799500 | [Thank, you, ., And, welcome, ,, everyone, ,, ... | [NNP, PRP, ., CC, JJ, ,, NN, ,, TO, NNP, POS, ... | 4 | 0 | 1 | 0 | 2 |
| 3 | A | q1-2019 | pres | You can find the press release, investor prese... | 0.0 | 1.000 | 0.000 | 0.0000 | 153000 | Agilent Technologies | ... | 67.742966 | 67.143463 | 66.799500 | [You, can, find, the, press, release, ,, inves... | [PRP, MD, VB, DT, NN, NN, ,, NN, NN, ,, CC, NN... | 6 | 1 | 3 | 1 | 1 |
| 4 | A | q1-2019 | pres | They just revealed what they believe are the t... | 0.0 | 0.826 | 0.174 | 0.8122 | 153000 | Agilent Technologies | ... | 67.742966 | 67.143463 | 66.799500 | [They, just, revealed, what, they, believe, ar... | [PRP, RB, VBD, WP, PRP, VBP, VBP, DT, JJ, JJS,... | 3 | 1 | 0 | 2 | 5 |
5 rows × 110 columns
df_transition.columns
Index(['ticker', 'quarter', 'qa', 'transcript', 'neg', 'neu', 'pos',
'compound', 'Unnamed: 0', 'co_clean',
...
'date_88', 'date_89', 'date_90', 'tokens', 'part_os', 'adj_count',
'superlatives_count', 'modal_count', 'verb_past_count',
'verb_pres_count'],
dtype='object', length=200)
Create dictionary for aggregating:
close_dict = {}
for i in close_list:
close_dict[i] = 'min'
agg_dict = {'neg':'sum', 'neu':'sum', 'pos':'sum', 'compound':'sum', 'close_0':'min', 'adj_count':'sum',
'superlatives_count':'sum', 'modal_count':'sum', 'verb_past_count':'sum', 'verb_pres_count':'sum', 'market_cap':'min'}
agg_dict.update(close_dict)
df_transition.columns
Index(['ticker', 'quarter', 'qa', 'transcript', 'neg', 'neu', 'pos',
'compound', 'Unnamed: 0', 'co_clean',
...
'date_88', 'date_89', 'date_90', 'tokens', 'part_os', 'adj_count',
'superlatives_count', 'modal_count', 'verb_past_count',
'verb_pres_count'],
dtype='object', length=200)
df_map.columns
Index(['Unnamed: 0', 'ticker', 'qa', 'quarter', 'co_clean', 'date', 'close_0',
'market_cap', 'close_1', 'close_2',
...
'date_81', 'date_82', 'date_83', 'date_84', 'date_85', 'date_86',
'date_87', 'date_88', 'date_89', 'date_90'],
dtype='object', length=188)
# create DF with aggregate data
df_tot = df_transition.groupby(['ticker', 'quarter', 'co_clean']).agg(agg_dict).reset_index()#.drop(columns = 'part_os')
df_t_agg = df_transition.groupby(['ticker', 'quarter'])['transcript'].apply(lambda x: ' '.join(x)).reset_index()
df_full = df_tot.merge(df_t_agg, on = ['ticker', 'quarter']) # merge DFs
df_agg = df_transition.groupby(['ticker', 'quarter', 'qa', 'co_clean']).agg(agg_dict).reset_index()
df_t = df_transition.groupby(['ticker', 'quarter', 'qa'])['transcript'].apply(lambda x: ' '.join(x)).reset_index()
df_call = df_agg.merge(df_t, on = ['ticker', 'quarter', 'qa'])
df_call.head(3)
| ticker | quarter | qa | co_clean | neg | neu | pos | compound | close_0 | adj_count | ... | close_82 | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | transcript | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A | q1-2019 | pres | Agilent Technologies | 0.938 | 34.897 | 6.166 | 26.3781 | 76.392479 | 266 | ... | 74.701042 | 66.475182 | 68.106598 | 67.919861 | NaN | NaN | 67.742966 | 67.143463 | 66.799500 | Thank you. And welcome, everyone, to Agilent's... |
| 1 | A | q1-2019 | qa | Agilent Technologies | 1.919 | 152.969 | 40.114 | 101.4719 | 76.392479 | 556 | ... | 74.701042 | 66.475182 | 68.106598 | 67.919861 | NaN | NaN | 67.742966 | 67.143463 | 66.799500 | All right. Duration: 69 minutes Yes. Hey, Patr... |
| 2 | A | q1-2020 | pres | Agilent Technologies | 0.273 | 16.009 | 2.717 | 15.0561 | 83.869682 | 252 | ... | NaN | 80.592461 | 79.184296 | 79.491722 | 80.959381 | 80.90979 | NaN | NaN | 82.962555 | Thank you, Jillian. Welcome everyone to Agilen... |
3 rows × 106 columns
Get relative ratio of the different POS counts by dividing by word count:
for i in ['neg', 'neu', 'pos', 'compound', 'adj_count', 'superlatives_count', 'modal_count', 'verb_past_count', 'verb_pres_count']:
df_call[i] = df_call[i]/len(df_call.transcript.str.split(' '))
for i in ['neg', 'neu', 'pos', 'compound', 'adj_count', 'superlatives_count', 'modal_count', 'verb_past_count', 'verb_pres_count']:
df_full[i] = df_full[i]/len(df_full.transcript.str.split(' '))
Add word count column to be used as a feature variable in the models:
df_call['word_count'] = [len(x.split()) for x in df_call['transcript'].tolist()]
df_full['word_count'] = [len(x.split()) for x in df_full['transcript'].tolist()]
df_qa = df_call[df_call['qa'] == 'qa']
df_qa[['ticker', 'quarter', 'word_count', 'compound']]
| ticker | quarter | word_count | compound | |
|---|---|---|---|---|
| 1 | A | q1-2019 | 8365 | 0.027581 |
| 3 | A | q1-2020 | 7041 | 0.015482 |
| 5 | A | q1-2021 | 7458 | 0.019954 |
| 7 | A | q2-2019 | 8162 | 0.023568 |
| 9 | A | q2-2020 | 6334 | 0.015798 |
| ... | ... | ... | ... | ... |
| 3670 | ZTS | q3-2019 | 12094 | 0.017119 |
| 3672 | ZTS | q3-2020 | 11324 | 0.009041 |
| 3674 | ZTS | q4-2018 | 9110 | 0.013237 |
| 3676 | ZTS | q4-2019 | 13696 | 0.017243 |
| 3678 | ZTS | q4-2020 | 15458 | 0.021915 |
1838 rows × 4 columns
df_qa[df_qa['word_count'].isnull()]
| ticker | quarter | qa | co_clean | neg | neu | pos | compound | close_0 | adj_count | ... | close_83 | close_84 | close_85 | close_86 | close_87 | close_88 | close_89 | close_90 | transcript | word_count |
|---|
0 rows × 107 columns
Get Q&A word count into the DF
df_full['qa_wc'] = df_full.merge(df_qa, on = ['ticker', 'quarter'])['word_count_y']
df_full['qa_comp'] = df_full.merge(df_qa, on = ['ticker', 'quarter'])['compound_y']
df_full['qa_pos'] = df_full.merge(df_qa, on = ['ticker', 'quarter'])['pos_y']
df_call.to_csv('../proj_data/output_data/model_vars.csv')
df_full.to_csv('../proj_data/output_data/model_vars_noQA.csv')
Purpose: Use feature variables created in previous files based on text transcripts to predict a stock's price movement.
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly_express as px
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, ElasticNetCV, RidgeCV, LassoCV, SGDRegressor, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
LIB = pd.read_csv('../proj_data/output_data/LIB.csv')
I use the file that is not split by Q&A for the modeling. I have word count by Q&A as a variable, which I think is important as a feature from the Q&A work, but I am not sure the file is too helpful as each of the prices are duplicated for that file, which may hurt the predictive power of the resulting model.
df = pd.read_csv('../proj_data/output_data/model_vars_noQA.csv').drop(columns = 'Unnamed: 0')
df.dropna(subset=['word_count', 'qa_wc'], inplace = True) # 3 lines have NA values for some reason
df.shape[0]
1838
extras = LIB[['ticker', 'sector', 'sub_sector']].drop_duplicates()
features = ['neg', 'neu', 'pos', 'compound', 'close_0', 'adj_count',
'superlatives_count', 'modal_count', 'verb_past_count',
'verb_pres_count', 'sector', 'sub_sector', 'market_cap', 'qa_wc', 'word_count', 'qa_pos']
df = df[~df['close_0'].isnull()]#.any(axis=1)] # drop where the closing price as of the call is null
df.shape[0] # see that we still have plenty of calls
1822
df = df.merge(extras, on = 'ticker') # pull in extra data for graphing
df.sub_sector.isnull().sum()
0
I one-hot encode the categorical data (i.e. the sectors):
df_ohe = pd.get_dummies(df[features])#[['qa_pres', 'qa_qa', 'sector', 'sub_sector']]
ohe_feats = [i for i in df_ohe.columns if i.startswith('sector')]
ohe_feats
['sector_Communication Services', 'sector_Consumer Discretionary', 'sector_Consumer Staples', 'sector_Energy', 'sector_Financials', 'sector_Health Care', 'sector_Industrials', 'sector_Information Technology', 'sector_Materials', 'sector_Real Estate', 'sector_Utilities']
df_ohe = df_ohe[ohe_feats]
df_all = df
df = df.merge(df_ohe, left_index = True, right_index = True).drop(columns = ['sub_sector', 'sector'])
features.remove('sector')
features.remove('sub_sector')
See quantitative variables:
features
['neg', 'neu', 'pos', 'compound', 'close_0', 'adj_count', 'superlatives_count', 'modal_count', 'verb_past_count', 'verb_pres_count', 'market_cap', 'qa_wc', 'word_count', 'qa_pos']
df['start_price'] = df['close_0'] # rename close_0 to start_price
df[['qa_wc', 'word_count', 'qa_comp']].head() # see the total word count and just the Q&A word count
| qa_wc | word_count | qa_comp | |
|---|---|---|---|
| 0 | 8365.0 | 11031 | 0.027581 |
| 1 | 7041.0 | 10064 | 0.015482 |
| 2 | 7458.0 | 10170 | 0.019954 |
| 3 | 8162.0 | 10646 | 0.023568 |
| 4 | 6334.0 | 9747 | 0.015798 |
The features are at many different scales, so I want to normalize to avoid potential issues in the modeling:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])
Add percentage change, which I also want to model. I do this with a loop to calc relative change for each 7-day period from the starting date. The percentage change is now a predictor in addition to the closing price +N weeks.
p_list = []
for i in range(1, 91):
s = i
df['perc_'+str(s)] = df['close_'+str(s)]/df['start_price']-1
p_list.append('perc_'+str(s))
Bring back the OHE and quantitative vars (i.e. POS, word counts, sentiment):
features = features + ohe_feats
Index the DF to get the model DF (with all variables, both features and predictors):
df_mod = df[features + close_list + p_list + ['start_price']]
df_mod[df_mod['qa_wc'].isna()] # check for any NAs
| neg | neu | pos | compound | close_0 | adj_count | superlatives_count | modal_count | verb_past_count | verb_pres_count | ... | perc_82 | perc_83 | perc_84 | perc_85 | perc_86 | perc_87 | perc_88 | perc_89 | perc_90 | start_price |
|---|
0 rows × 206 columns
df_mod.head()
| neg | neu | pos | compound | close_0 | adj_count | superlatives_count | modal_count | verb_past_count | verb_pres_count | ... | perc_82 | perc_83 | perc_84 | perc_85 | perc_86 | perc_87 | perc_88 | perc_89 | perc_90 | start_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.377345 | 1.261148 | 1.305921 | 0.944483 | -0.394863 | -0.052383 | -0.195670 | -0.074228 | -0.049089 | -0.006603 | ... | -0.022141 | -0.129820 | -0.108465 | -0.110909 | NaN | NaN | -0.113225 | -0.121072 | -0.125575 | 76.392479 |
| 1 | -0.402315 | -0.189741 | 0.063311 | -0.395799 | -0.368238 | -0.307435 | -0.405712 | -0.377771 | -0.100571 | -0.158068 | ... | NaN | -0.039075 | -0.055865 | -0.052200 | -0.034700 | -0.035292 | NaN | NaN | -0.010816 | 83.869682 |
| 2 | -0.497047 | 0.074105 | 0.384261 | 0.152072 | -0.212564 | -0.311359 | -0.195670 | -0.325436 | 0.041995 | -0.048333 | ... | NaN | 0.032205 | 0.028683 | 0.006923 | 0.017725 | 0.026569 | NaN | NaN | 0.018429 | 127.587898 |
| 3 | 0.034505 | 0.760039 | 1.499112 | 0.576695 | -0.430177 | -0.246615 | -0.965823 | -0.398705 | 0.228123 | 0.069130 | ... | -0.024173 | 0.002498 | 0.007091 | 0.029612 | 0.017018 | NaN | NaN | -0.012764 | 0.002201 | 66.475182 |
| 4 | 0.665300 | 0.173400 | 0.412757 | -0.092176 | -0.366807 | -0.403569 | -0.615753 | -0.116096 | -0.084731 | -0.175069 | ... | 0.151271 | 0.164714 | 0.148205 | NaN | NaN | 0.149856 | 0.153983 | 0.155162 | 0.153040 | 84.271545 |
5 rows × 206 columns
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_mod, df_mod, test_size=0.30, random_state=24)
lm = LinearRegression()
Add the company info to the model-variables DF:
df_all = df_all[['ticker', 'quarter', 'co_clean', 'sector', 'sub_sector']].merge(df_mod, left_index = True, right_index = True)
features # all feature variables
['neg', 'neu', 'pos', 'compound', 'close_0', 'adj_count', 'superlatives_count', 'modal_count', 'verb_past_count', 'verb_pres_count', 'market_cap', 'qa_wc', 'word_count', 'qa_pos', 'sector_Communication Services', 'sector_Consumer Discretionary', 'sector_Consumer Staples', 'sector_Energy', 'sector_Financials', 'sector_Health Care', 'sector_Industrials', 'sector_Information Technology', 'sector_Materials', 'sector_Real Estate', 'sector_Utilities']
Create function to return two dataframes: one containing the test DF with predictions as well as the training DF with predictions. DFs contain other statistics and metrics to help determine model performance. Input metric can be 'close' for the closing price at day N (multiples of 7 days) or 'per' for the percentage change. These metrics represent the predictor variables. Can also pass the model into the function.
Setting test_train = True returns one DF with a test/train flag.
from sklearn.metrics import r2_score
def model_(days, metric = 'close', model = LinearRegression(), test_train = False):
feats = features.copy()
met_days = metric +'_'+ str(days)
df_train = df_all.filter(items = x_train.index, axis = 0)
df_train = df_train[~df_train[met_days].isnull()] # removes NAs for the date so there's no error when fitting
df_test = df_all.filter(items = x_test.index, axis = 0)
df_test = df_test[~df_test[met_days].isnull()] # remove NAs for same reason
if 'perc' in metric: # allows for predicting percentage change, but I don't trust this as much
feats.remove('close_0') # i don't use close 0 with percentage change
preds = df_train[feats]#.drop(columns = 'close_0')#x_train[~x_train[close].isnull()][features]
else:
preds = df_train[feats]#x_train[~x_train[close].isnull()][features]
y = df_train[met_days]
m = model.fit(preds, y)
df_test['preds'] = m.predict(x_test[~x_test[met_days].isnull()][feats]) # index for features
df_test['perc_diff'] = round((df_test['preds']/df_test['start_price']-1)*100, 3) # calc predicted % change
df_test['perc_gain'] = round(100*(df_test['close_'+str(days)]/df_test['start_price']-1), 3) # calc actual % change
if metric == 'close': # create some error/loss metrics
df_test['err'] = df_test['preds'] - df_test[met_days]
df_test['rel_err'] = df_test['perc_diff']/100 - df_test['perc_gain']/100
else:
df_test['err'] = df_test['preds'] - df_test['perc_gain']/100
df_test['rel_err'] = df_test['err']
df_test['mse'] = np.mean(df_test['err']**2) # more loss functions
df_test['rel_mse'] = np.mean(df_test['rel_err']**2) # loss function I use mostly
df_train['preds'] = m.predict(df_train[~df_train[met_days].isnull()][feats]) # predict on training as well
df_train['perc_diff'] = round((df_train['preds']/df_train['start_price']-1)*100, 3)
df_train['perc_gain'] = round(100*(df_train['close_'+str(days)]/df_train['start_price']-1), 3)
if metric == 'close': # get same metrics on the training data
df_train['err'] = df_train['preds'] - df_train[met_days]
df_train['rel_err'] = df_train['perc_diff']/100 - df_train['perc_gain']/100
else:
df_train['err'] = df_train['preds'] - df_train['perc_gain']/100
df_train['rel_err'] = df_train['err']
df_train['mse'] = np.mean(df_train['err']**2)
df_train['rel_mse'] = np.mean(df_train['rel_err']**2)
if test_train:
df_test['test_train'] = 'test'
df_train['test_train'] = 'train'
# return either the appended DF with a test-train split and R2 for the testing and training data...
return df_test.append(df_train), r2_score(x_test[~x_test[met_days].isnull()][met_days], df_test['preds']), r2_score(y, df_train['preds'])
# or return separate DFs for test and train with their R2
else:
return df_test, df_train, r2_score(x_test[~x_test[met_days].isnull()][met_days], df_test['preds']), r2_score(y, df_train['preds'])
Show MSE for model on test and training data:
#min(t1t2[t1t2['test_train']== 'train']['rel_mse'])
#min(t1t2[t1t2['test_train']== 'test']['rel_mse'])
l = list(range(1, 91))
d = {}
Produce Some CV models:
grid = dict()
grid['alpha'] = np.arange(0, 5, 1)
# define search
ridgecv = GridSearchCV(Ridge(), grid, scoring='neg_mean_squared_error', cv=RepeatedKFold(n_splits=20, n_repeats=3, random_state=1), n_jobs=-1)
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.05)
# define search
lassocv = GridSearchCV(Lasso(), grid, scoring='neg_mean_absolute_error', cv=RepeatedKFold(n_splits=20, n_repeats=3, random_state=1), n_jobs=-1)
grid = dict()
#grid['alpha'] = np.arange(0, 10, 1) # I kept these in to change around. It's really slow with some in for little predictive gain
#grid['gamma'] = np.arange(0, 10, 1)
grid['degree'] = np.arange(1, 5, 1)
#grid['min_samples_split'] = np.arange(1, 5, 1)
# define search
kridgecv = GridSearchCV(KernelRidge(kernel = 'poly'), grid, scoring='neg_mean_squared_error', cv=RepeatedKFold(n_splits=10, n_repeats=3, random_state=1), n_jobs=-1)
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.05)
#grid['gamma'] = np.arange(0, 10, 1)
#grid['degree'] = np.arange(1, 5, 1)
#grid['min_samples_split'] = np.arange(1, 5, 1)
# define search
kridgecvlin = GridSearchCV(KernelRidge(kernel = 'linear'), grid, scoring='neg_mean_squared_error', cv=RepeatedKFold(n_splits=10, n_repeats=3, random_state=1), n_jobs=-1)
Test function:
As an example, return the two DFs with predictors. We can see the error for both test and train, which can help determine if there is overfitting.
t1t2, r2_test, r2_train = model_(90,metric = 'close', test_train = True, model = lassocv) #lasso CV model created above
r2_test
0.9817998217070911
r2_train
0.985264663294468
R^2 are similar, which is good.
t1t2
| ticker | quarter | co_clean | sector | sub_sector | neg | neu | pos | compound | close_0 | ... | perc_90 | start_price | preds | perc_diff | perc_gain | err | rel_err | mse | rel_mse | test_train | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 709 | FCX | q2-2020 | Freeport-McMoRan | Materials | Copper | 0.205173 | -0.332245 | -0.380739 | 0.104899 | -0.619660 | ... | 0.330330 | 13.262485 | 12.146396 | -8.415 | 33.033 | -5.497088 | -0.41448 | 1084.053444 | 0.029502 | test |
| 366 | CCI | q4-2020 | Crown Castle | Real Estate | Specialized REITs | -0.759440 | 0.091696 | -0.124883 | 0.021169 | -0.112324 | ... | 0.181212 | 155.738266 | 164.465932 | 5.604 | 18.121 | -19.494029 | -0.12517 | 1084.053444 | 0.029502 | test |
| 547 | DHR | q1-2021 | Danaher Corporation | Health Care | Health Care Equipment | -1.017322 | -0.897663 | -0.765973 | -0.609713 | 0.233022 | ... | 0.107144 | 252.722290 | 267.603840 | 5.888 | 10.714 | -12.196147 | -0.04826 | 1084.053444 | 0.029502 | test |
| 67 | ADP | q1-2021 | Automatic Data Processing | Information Technology | Data Processing & Outsourced Services | 1.983279 | 0.102227 | -0.205171 | -0.073258 | -0.122992 | ... | 0.066643 | 152.742340 | 164.104638 | 7.439 | 6.664 | 1.183160 | 0.00775 | 1084.053444 | 0.029502 | test |
| 1499 | SBUX | q1-2021 | Starbucks | Consumer Discretionary | Restaurants | -1.045140 | -0.370523 | -0.610124 | 0.058412 | -0.321238 | ... | 0.191831 | 97.068893 | 103.296029 | 6.415 | 19.183 | -12.393668 | -0.12768 | 1084.053444 | 0.029502 | test |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1425 | PYPL | q4-2019 | PayPal | Information Technology | Data Processing & Outsourced Services | -1.351140 | -1.262126 | -1.243313 | -0.948794 | -0.251476 | ... | -0.004457 | 116.660004 | 125.634367 | 7.693 | -0.446 | 9.494368 | 0.08139 | 1541.716337 | 0.032426 | train |
| 343 | C | q3-2019 | Citigroup | Financials | Diversified Banks | -0.219618 | 0.569081 | 0.960324 | 0.810432 | -0.430318 | ... | 0.140501 | 66.435608 | 72.291000 | 8.814 | 14.050 | -3.478867 | -0.05236 | 1541.716337 | 0.032426 | train |
| 192 | ANTM | q4-2019 | Anthem | Health Care | Managed Health Care | -0.074512 | 0.890927 | 0.790631 | 1.514701 | 0.286039 | ... | -0.013528 | 267.611023 | 279.858032 | 4.576 | -1.353 | 15.867126 | 0.05929 | 1541.716337 | 0.032426 | train |
| 899 | ILMN | q4-2019 | Illumina | Health Care | Life Sciences Tools & Services | 0.209684 | 0.046831 | -1.003799 | -0.620616 | 0.448731 | ... | 0.001564 | 313.299988 | 330.508135 | 5.493 | 0.156 | 16.718126 | 0.05337 | 1541.716337 | 0.032426 | train |
| 418 | CMG | q1-2021 | Chipotle Mexican Grill | Consumer Discretionary | Restaurants | -0.955671 | -0.457472 | 0.017529 | -0.001590 | 4.701536 | ... | 0.044262 | 1507.619995 | 1578.805261 | 4.722 | 4.426 | 4.455285 | 0.00296 | 1541.716337 | 0.032426 | train |
1693 rows × 219 columns
Plot actual percentage gain against market cap. I show test-train split to make sure the data don't seem overfit. This is for call date +90 days.
fig = px.scatter(data_frame = t1t2, x = 'market_cap', y = 'perc_gain', color = 'test_train',height=800,
hover_name= 'ticker')#, hover_data=["err", "perc_diff", 'quarter'])#, marginal_x='box', marginal_y='box', height=800)
fig.show()
Show the rel_err from the model (predicted gain less actual) by market cap. Notice more volatility at lower MCs. Call date +90
fig = px.scatter(data_frame = t1t2, x = 'preds', y = 'rel_err', color = 'test_train',height=800,
hover_name= 'ticker')#, hover_data=["err", "perc_diff", 'quarter'])#, marginal_x='box', marginal_y='box', height=800)
fig.show()
Last initial viz by sector and rel_err vs market cap. Same call date +90.
fig = px.scatter(data_frame = t1t2, x = 'market_cap', y = 'rel_err', color = 'sector',height=800,
hover_name= 'ticker')#, hover_data=["err", "perc_diff", 'quarter'])#, marginal_x='box', marginal_y='box', height=800)
fig.show()
# dictionary of all models. I loop through to determine and compare model performance
modlist = {KernelRidge(): 'SKLearn Kernel Ridge', RidgeCV(): 'SKLearn Ridge CV',
LinearRegression(): 'SKLearn Lin Reg', kridgecvlin: 'GridSearch Lin Kernel Ridge CV',
ElasticNetCV(): 'SKLean Elastic Net CV', SVR(): 'SKLearn Support Vect. Reg.', lassocv: 'GridSearch Lasso CV',
LassoCV(): 'SKLearn Lasso CV', kridgecv: 'GridSearch Poly Kernel Ridge CV'}
# these are not helpful
# ridgecv: 'GridSearch Ridge CV', lassocv: 'GridSearch Lasso CV', RandomForestRegressor():'RF',GradientBoostingRegressor(): 'GBReg'
Model Performance Analysis I want to see, out of all the models, which performs the best. I run all models 90 times (once per number of days out) to predict closing price. I then group the output and analyze.
output_df = pd.DataFrame()
op_dict = {}
op_dict_train = {}
for i in l:
for m in modlist:
for k in ['close']:
tmp, r2_test, r2_train = model_(days = i, metric = k, model = m, test_train = True)
tmp['days'] = i
tmp['predictor'] = k
tmp['model'] = modlist[m] # removed days from groupby
tmp['act'] = tmp[k+'_'+str(i)]
tmp = tmp[['sector', 'test_train', 'market_cap', 'err', 'predictor', 'model', 'rel_err', 'mse', 'preds', 'act', 'days']]
output_df = output_df.append(tmp)
op_dict[(modlist[m], i)] = r2_test # save R2 values for lookup
op_dict_train[(modlist[m], i)] = r2_train
#output_df.to_csv('output_test3.csv')
output_df['key'] = list(zip(output_df.model, output_df.days)) # will merge R2 to this by key
#output_df
Lookup R2 values for test and train (these are output from the model functions, which I fed into dictionaries).
output_df['r2_test'] = output_df['key'].map(op_dict)
output_df['r2_train'] = output_df['key'].map(op_dict_train)
output_df['r2'] = np.where(output_df['test_train'] == 'train', output_df['r2_train'], output_df['r2_test'])
FULL METRICS TABLE. One row per model per day post call (1-90 days)
output_df
| sector | test_train | market_cap | err | predictor | model | rel_err | mse | preds | act | days | key | r2_test | r2_train | r2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 709 | Materials | test | -0.374558 | -6.076836 | close | SKLearn Kernel Ridge | -0.45819 | 227.298423 | 6.986512 | 13.063348 | 1 | (SKLearn Kernel Ridge, 1) | 0.995554 | 0.997756 | 0.995554 |
| 1571 | Utilities | test | -0.423201 | -6.393997 | close | SKLearn Kernel Ridge | -0.05833 | 227.298423 | 104.556649 | 110.950645 | 1 | (SKLearn Kernel Ridge, 1) | 0.995554 | 0.997756 | 0.995554 |
| 366 | Real Estate | test | -0.290335 | -3.297193 | close | SKLearn Kernel Ridge | -0.02118 | 227.298423 | 153.553957 | 156.851151 | 1 | (SKLearn Kernel Ridge, 1) | 0.995554 | 0.997756 | 0.995554 |
| 547 | Health Care | test | 0.143196 | -5.249842 | close | SKLearn Kernel Ridge | -0.02077 | 227.298423 | 254.327093 | 259.576935 | 1 | (SKLearn Kernel Ridge, 1) | 0.995554 | 0.997756 | 0.995554 |
| 67 | Information Technology | test | -0.267579 | -6.672264 | close | SKLearn Kernel Ridge | -0.04368 | 227.298423 | 149.133202 | 155.805466 | 1 | (SKLearn Kernel Ridge, 1) | 0.995554 | 0.997756 | 0.995554 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1425 | Information Technology | train | 0.447445 | 10.711854 | close | GridSearch Poly Kernel Ridge CV | 0.09182 | 1546.601591 | 126.851854 | 116.139999 | 90 | (GridSearch Poly Kernel Ridge CV, 90) | 0.978788 | 0.985218 | 0.985218 |
| 343 | Financials | train | -0.084460 | -1.054369 | close | GridSearch Poly Kernel Ridge CV | -0.01587 | 1546.601591 | 74.715498 | 75.769867 | 90 | (GridSearch Poly Kernel Ridge CV, 90) | 0.978788 | 0.985218 | 0.985218 |
| 192 | Health Care | train | -0.271549 | 10.228026 | close | GridSearch Poly Kernel Ridge CV | 0.03822 | 1546.601591 | 274.218932 | 263.990906 | 90 | (GridSearch Poly Kernel Ridge CV, 90) | 0.978788 | 0.985218 | 0.985218 |
| 899 | Health Care | train | -0.318111 | 13.051707 | close | GridSearch Poly Kernel Ridge CV | 0.04166 | 1546.601591 | 326.841716 | 313.790009 | 90 | (GridSearch Poly Kernel Ridge CV, 90) | 0.978788 | 0.985218 | 0.985218 |
| 418 | Consumer Discretionary | train | -0.385416 | -14.647028 | close | GridSearch Poly Kernel Ridge CV | -0.00971 | 1546.601591 | 1559.702948 | 1574.349976 | 90 | (GridSearch Poly Kernel Ridge CV, 90) | 0.978788 | 0.985218 | 0.985218 |
1008108 rows × 15 columns
output_df.to_csv('../proj_data/output_data/model_performance_ALL.csv')
output_grouped = output_df.sort_values(['predictor','sector','test_train','rel_err', 'r2'],ascending=True)\
.groupby(['sector', 'test_train', 'predictor', 'model'])\
.agg({'rel_err': (lambda x: np.mean(x**2)), 'r2':'mean'}) # my loss function; MSrelativeErr
Top 2 models by sector and test-train split, sorted by the MSrelE metric:
with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
display(output_grouped.reset_index().sort_values(['predictor','sector','test_train','rel_err', 'r2'],ascending=True).groupby(['sector', 'test_train', 'predictor']).head(2))
| sector | test_train | predictor | model | rel_err | r2 | |
|---|---|---|---|---|---|---|
| 0 | Communication Services | test | close | GridSearch Lasso CV | 0.008700 | 0.984336 |
| 5 | Communication Services | test | close | SKLearn Lasso CV | 0.009793 | 0.984072 |
| 9 | Communication Services | train | close | GridSearch Lasso CV | 0.013616 | 0.989937 |
| 14 | Communication Services | train | close | SKLearn Lasso CV | 0.014678 | 0.989909 |
| 18 | Consumer Discretionary | test | close | GridSearch Lasso CV | 0.045953 | 0.984254 |
| 23 | Consumer Discretionary | test | close | SKLearn Lasso CV | 0.053595 | 0.983990 |
| 27 | Consumer Discretionary | train | close | GridSearch Lasso CV | 0.029450 | 0.989818 |
| 28 | Consumer Discretionary | train | close | GridSearch Lin Kernel Ridge CV | 0.031824 | 0.990048 |
| 36 | Consumer Staples | test | close | GridSearch Lasso CV | 0.004989 | 0.984233 |
| 37 | Consumer Staples | test | close | GridSearch Lin Kernel Ridge CV | 0.005584 | 0.983854 |
| 46 | Consumer Staples | train | close | GridSearch Lin Kernel Ridge CV | 0.008117 | 0.990196 |
| 52 | Consumer Staples | train | close | SKLearn Ridge CV | 0.008181 | 0.990189 |
| 60 | Energy | test | close | SKLearn Lin Reg | 0.062399 | 0.983768 |
| 61 | Energy | test | close | SKLearn Ridge CV | 0.063407 | 0.983700 |
| 63 | Energy | train | close | GridSearch Lasso CV | 0.036504 | 0.990464 |
| 68 | Energy | train | close | SKLearn Lasso CV | 0.037316 | 0.990433 |
| 76 | Financials | test | close | SKLearn Kernel Ridge | 0.024405 | 0.983515 |
| 72 | Financials | test | close | GridSearch Lasso CV | 0.025314 | 0.984214 |
| 85 | Financials | train | close | SKLearn Kernel Ridge | 0.015169 | 0.990057 |
| 81 | Financials | train | close | GridSearch Lasso CV | 0.015620 | 0.989950 |
| 90 | Health Care | test | close | GridSearch Lasso CV | 0.010439 | 0.984214 |
| 95 | Health Care | test | close | SKLearn Lasso CV | 0.010736 | 0.983950 |
| 99 | Health Care | train | close | GridSearch Lasso CV | 0.012858 | 0.989798 |
| 104 | Health Care | train | close | SKLearn Lasso CV | 0.013239 | 0.989769 |
| 108 | Industrials | test | close | GridSearch Lasso CV | 0.024950 | 0.984371 |
| 113 | Industrials | test | close | SKLearn Lasso CV | 0.025682 | 0.984094 |
| 121 | Industrials | train | close | SKLearn Kernel Ridge | 0.055477 | 0.989916 |
| 122 | Industrials | train | close | SKLearn Lasso CV | 0.058305 | 0.989783 |
| 126 | Information Technology | test | close | GridSearch Lasso CV | 0.014038 | 0.984140 |
| 131 | Information Technology | test | close | SKLearn Lasso CV | 0.014078 | 0.983881 |
| 135 | Information Technology | train | close | GridSearch Lasso CV | 0.014343 | 0.989804 |
| 140 | Information Technology | train | close | SKLearn Lasso CV | 0.014552 | 0.989776 |
| 144 | Materials | test | close | GridSearch Lasso CV | 0.058929 | 0.984310 |
| 149 | Materials | test | close | SKLearn Lasso CV | 0.086107 | 0.984060 |
| 153 | Materials | train | close | GridSearch Lasso CV | 0.022904 | 0.990049 |
| 158 | Materials | train | close | SKLearn Lasso CV | 0.029477 | 0.990019 |
| 162 | Real Estate | test | close | GridSearch Lasso CV | 0.007487 | 0.984293 |
| 167 | Real Estate | test | close | SKLearn Lasso CV | 0.007508 | 0.984022 |
| 171 | Real Estate | train | close | GridSearch Lasso CV | 0.008672 | 0.989926 |
| 176 | Real Estate | train | close | SKLearn Lasso CV | 0.008816 | 0.989892 |
| 182 | Utilities | test | close | GridSearch Poly Kernel Ridge CV | 0.006372 | 0.980355 |
| 180 | Utilities | test | close | GridSearch Lasso CV | 0.007771 | 0.983876 |
| 196 | Utilities | train | close | SKLearn Ridge CV | 0.008237 | 0.990179 |
| 195 | Utilities | train | close | SKLearn Lin Reg | 0.008559 | 0.990189 |
Notice that LassoCV model largely performs the best or second best. Use as champion model.
Show some quick distributions to see if there are data oddities. Notice slight imbalance by sector and market caps.
display(model_(7, test_train = True)[0].groupby(['sector', 'test_train'])['ticker'].count())
sector test_train
Communication Services test 37
train 84
Consumer Discretionary test 52
train 114
Consumer Staples test 35
train 85
Energy test 7
train 37
Financials test 72
train 153
Health Care test 106
train 244
Industrials test 78
train 141
Information Technology test 92
train 239
Materials test 25
train 51
Real Estate test 17
train 61
Utilities test 23
train 54
Name: ticker, dtype: int64
import seaborn as sns
sns.boxplot(x = 'sector', y = 'market_cap', data = model_(7, test_train = True)[0])
plt.xticks(rotation=70)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), [Text(0, 0, 'Consumer Staples'), Text(1, 0, 'Materials'), Text(2, 0, 'Utilities'), Text(3, 0, 'Real Estate'), Text(4, 0, 'Health Care'), Text(5, 0, 'Information Technology'), Text(6, 0, 'Consumer Discretionary'), Text(7, 0, 'Industrials'), Text(8, 0, 'Financials'), Text(9, 0, 'Communication Services'), Text(10, 0, 'Energy')])
I made some relatively simple graph functions to take in my info and visualize. They are pretty flexible, especially with the model_ function, which is also flexible.
import plotly.graph_objects as go
def graph_(df, close, color = 'sector', size = None, logs = False, metric = 'close_'):
fig = px.scatter(data_frame = df, x = 'preds', y = metric+str(close), color = color,height=800,
hover_name= 'ticker', hover_data=['start_price', 'err', 'perc_diff', 'quarter', 'perc_gain', 'close_'+str(close)], size = size, opacity = .65,
log_x = logs, log_y = logs)#, marginal_x='box', marginal_y='box', height=800)
fig.update_traces(marker=dict(size=12,
line=dict(width=1,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig3 = go.Figure(data = fig.data + px.line(y = [df[metric+str(close)].min(), df[metric+str(close)].max()],
x = [df[metric+str(close)].min(), df[metric+str(close)].max()],
log_x = logs, log_y = logs).data)
fig3.update_layout(
autosize=False,
width=1400,
height=800,
margin=dict(
l=50,
r=50,
b=30,
t=30,
pad=4
),
#paper_bgcolor="LightSteelBlue",
)
if logs:
fig3.update_layout(xaxis_type = "log", yaxis_type = "log" )
fig3.show()
def graph_2(df, close, color = 'sector', size = None, logs = False, metric = 'close_', x = 'preds', y = 'close_7'):
print('Days', i)
fig = px.scatter(data_frame = df, x = x, y = y, color = color,height=1000,
hover_name= 'ticker', hover_data=['start_price', 'err', 'perc_diff', 'quarter', 'perc_gain'], size = size, opacity = .65,
log_x = logs, log_y = logs)
fig3 = go.Figure(data = fig.data + px.line(y = [df[y].min(), df[y].max()],
x = [df[y].min(), df[y].max()],
log_x = logs, log_y = logs).data)
fig.show()
Loop through list of days out and graph.
for i in l[:2]:
graph_2(model_(metric = 'close', days = i, model = lassocv)[0], i, color = 'sector', logs = False, metric = 'close_', x = 'market_cap', y = 'rel_err')
Days 1
Days 2
The plots below show the predicted close price at day n (printed above the graph) against the actual day n close price. These are for the Lasso CV model, and I show prediction increments of 1, 2, 3, 7, 14, and 90 days. Notice the predictions begin to disperse as time increases; there is more spread around the parity line.
for i in [1, 2, 3, 7, 14, 21, 90]:
print('Days', i)
print('Test R2 =', model_(metric = 'close', days = i, model = lassocv, test_train = True)[1])
graph_(model_(metric = 'close', days = i, model = lassocv, test_train = True)[0], i, color = 'test_train', logs = False, metric = 'close_')#, x = 'perc_diff', y = 'perc_gain')
Days 1 Test R2 = 0.9958236630976603
Days 2 Test R2 = 0.9938261878550942
Days 3 Test R2 = 0.9898124757199774
Days 7 Test R2 = 0.9920102374893159
Days 14 Test R2 = 0.9891399428225021
Days 21 Test R2 = 0.9874865467310366
Days 90 Test R2 = 0.9817998217070911
for i in [1, 2, 3, 7, 14, 21, 90]:
print('Days', i)
print('Test R2 =', model_(metric = 'close', days = i, model = lassocv, test_train = True)[1])
graph_(model_(metric = 'close', days = i, model = lassocv)[0], i, color = 'sector', logs = False, metric = 'close_')
Days 1 Test R2 = 0.9958236630976603
Days 2 Test R2 = 0.9938261878550942
Days 3 Test R2 = 0.9898124757199774
Days 7 Test R2 = 0.9920102374893159
Days 14 Test R2 = 0.9891399428225021
Days 21 Test R2 = 0.9874865467310366
Days 90 Test R2 = 0.9817998217070911
#for i in l[:2]:
# graph_(model_(metric = 'perc', days = i, model = ElasticNetCV())[0], i, color = 'sector', logs = False, metric = 'perc_')#, x = 'perc_diff', y = 'perc_gain')